From 2f60cb0dbb62c74bab0d9c4756b1b0e92da4bcdd Mon Sep 17 00:00:00 2001 From: Brad Pepers Date: Sat, 9 May 2026 07:08:04 -0600 Subject: [PATCH 01/11] Create common_utils shared utility library and move pure utilities Introduce a header-only common_utils library for pure C++ utilities shared by MIOpen library, MIOpenDriver, and tests. This is the first step toward a layered architecture that eliminates circular dependencies between driver, test, and library code. Move 9 utility headers from src/include/miopen/ to common_utils/include/common_utils/: - rank.hpp, returns.hpp, algorithm.hpp (zero-dependency) - float_equal.hpp, each_args.hpp, type_name.hpp, par_for.hpp - functional.hpp, ford.hpp (depend on other moved utilities) Original locations retain thin forwarding headers for backward compatibility. All internal cross-references within moved headers updated to use common_utils/ paths. CMake: common_utils added as INTERFACE library, linked by MIOpen, MIOpenDriver, and test targets. Co-Authored-By: Claude Sonnet 4 --- projects/miopen/CMakeLists.txt | 1 + projects/miopen/common_utils/CMakeLists.txt | 35 ++++ .../include/common_utils/algorithm.hpp | 47 ++++++ .../include/common_utils/each_args.hpp | 79 ++++++++++ .../include/common_utils/float_equal.hpp | 89 +++++++++++ .../include/common_utils/ford.hpp | 122 ++++++++++++++ .../include/common_utils/functional.hpp | 131 +++++++++++++++ .../include/common_utils/par_for.hpp | 149 ++++++++++++++++++ .../include/common_utils/rank.hpp | 42 +++++ .../include/common_utils/returns.hpp | 38 +++++ .../include/common_utils/type_name.hpp | 139 ++++++++++++++++ projects/miopen/driver/CMakeLists.txt | 2 +- projects/miopen/src/CMakeLists.txt | 2 +- .../miopen/src/include/miopen/algorithm.hpp | 21 +-- .../miopen/src/include/miopen/each_args.hpp | 53 +------ .../miopen/src/include/miopen/float_equal.hpp | 63 +------- projects/miopen/src/include/miopen/ford.hpp | 121 +------------- .../miopen/src/include/miopen/functional.hpp | 130 
+-------------- .../miopen/src/include/miopen/par_for.hpp | 123 +-------------- projects/miopen/src/include/miopen/rank.hpp | 16 +- .../miopen/src/include/miopen/returns.hpp | 12 +- .../miopen/src/include/miopen/type_name.hpp | 113 +------------ projects/miopen/test/CMakeLists.txt | 4 +- 23 files changed, 894 insertions(+), 638 deletions(-) create mode 100644 projects/miopen/common_utils/CMakeLists.txt create mode 100644 projects/miopen/common_utils/include/common_utils/algorithm.hpp create mode 100644 projects/miopen/common_utils/include/common_utils/each_args.hpp create mode 100644 projects/miopen/common_utils/include/common_utils/float_equal.hpp create mode 100644 projects/miopen/common_utils/include/common_utils/ford.hpp create mode 100644 projects/miopen/common_utils/include/common_utils/functional.hpp create mode 100644 projects/miopen/common_utils/include/common_utils/par_for.hpp create mode 100644 projects/miopen/common_utils/include/common_utils/rank.hpp create mode 100644 projects/miopen/common_utils/include/common_utils/returns.hpp create mode 100644 projects/miopen/common_utils/include/common_utils/type_name.hpp diff --git a/projects/miopen/CMakeLists.txt b/projects/miopen/CMakeLists.txt index af87cd1c7e16..627ddec85bbd 100644 --- a/projects/miopen/CMakeLists.txt +++ b/projects/miopen/CMakeLists.txt @@ -894,6 +894,7 @@ set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/bin) if(NOT MIOPEN_USE_SQLITE_PERFDB) add_subdirectory(tools/sqlite2txt) endif() +add_subdirectory(common_utils) add_subdirectory(addkernels) add_subdirectory(src) if(MIOPEN_BUILD_DRIVER) diff --git a/projects/miopen/common_utils/CMakeLists.txt b/projects/miopen/common_utils/CMakeLists.txt new file mode 100644 index 000000000000..c0f4620a3439 --- /dev/null +++ b/projects/miopen/common_utils/CMakeLists.txt @@ -0,0 +1,35 @@ +################################################################################ +# +# MIT License +# +# Copyright (c) 2025 Advanced Micro Devices, Inc. 
+# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. +# +################################################################################ + +# Header-only utility library shared by MIOpen, MIOpenDriver, and tests. +# Contains pure C++ utilities with NO MIOpen or GPU dependencies. + +add_library(miopen_common_utils INTERFACE) +add_library(MIOpen::common_utils ALIAS miopen_common_utils) + +target_include_directories(miopen_common_utils INTERFACE + $ +) diff --git a/projects/miopen/common_utils/include/common_utils/algorithm.hpp b/projects/miopen/common_utils/include/common_utils/algorithm.hpp new file mode 100644 index 000000000000..d1098a066077 --- /dev/null +++ b/projects/miopen/common_utils/include/common_utils/algorithm.hpp @@ -0,0 +1,47 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2019 Advanced Micro Devices, Inc. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + *******************************************************************************/ +#ifndef GUARD_MLOPEN_ALGORITHM_HPP +#define GUARD_MLOPEN_ALGORITHM_HPP + +#include + +namespace miopen { + +template +bool any_of(const Range& r, Predicate p) +{ + return std::any_of(r.begin(), r.end(), p); +} + +template +bool all_of(const Range& r, Predicate p) +{ + return std::all_of(r.begin(), r.end(), p); +} + +} // namespace miopen + +#endif diff --git a/projects/miopen/common_utils/include/common_utils/each_args.hpp b/projects/miopen/common_utils/include/common_utils/each_args.hpp new file mode 100644 index 000000000000..e078153dc998 --- /dev/null +++ b/projects/miopen/common_utils/include/common_utils/each_args.hpp @@ -0,0 +1,79 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2017 Advanced Micro Devices, Inc. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + *******************************************************************************/ +#ifndef GUARD_MIOPEN_EACH_ARGS_HPP +#define GUARD_MIOPEN_EACH_ARGS_HPP + +#include +#include +#include + +namespace miopen { +namespace detail { + +template +void each_args_i_impl(F f, std::index_sequence, Ts&&... xs) +{ + (void)std::initializer_list{ + (f(std::integral_constant{}, std::forward(xs)), 0)...}; +} + +template +auto unpack_impl(F f, std::index_sequence, T&& x) +{ + return f(std::get(x)...); +} + +} // namespace detail + +template +void each_args_i(F f, Ts&&... xs) +{ + detail::each_args_i_impl(f, std::make_index_sequence(), std::forward(xs)...); +} + +template +void each_args(F f, Ts&&... 
xs) +{ + (void)std::initializer_list{(f(std::forward(xs)), 0)...}; +} + +// Workaround for gcc warnings +template +void each_args(F) +{ +} + +template +auto unpack(F f, T&& x) +{ + using type = typename std::remove_cv::type>::type; + return detail::unpack_impl( + f, std::make_index_sequence::value>(), std::forward(x)); +} + +} // namespace miopen + +#endif diff --git a/projects/miopen/common_utils/include/common_utils/float_equal.hpp b/projects/miopen/common_utils/include/common_utils/float_equal.hpp new file mode 100644 index 000000000000..24bbdc55ad11 --- /dev/null +++ b/projects/miopen/common_utils/include/common_utils/float_equal.hpp @@ -0,0 +1,89 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2017 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + *******************************************************************************/ +#ifndef GUARD_MLOPEN_FLOAT_EQUAL_HPP +#define GUARD_MLOPEN_FLOAT_EQUAL_HPP + +#include +#include +#include +#include + +namespace miopen { + +template +using common_type = typename std::common_type::type; + +struct float_equal_fn +{ + template + static bool apply(T x, T y) + { + // The standard library from MSVC does not implement std::isfinite() for integer + // types - no additional overloads are provided. According to the documentation, + // integer types should be treaded as doubles. + // Refer to https://en.cppreference.com/w/cpp/numeric/math/isfinite for more information. + return std::isfinite(static_cast(x)) and std::isfinite(static_cast(y)) and + std::nextafter(x, std::numeric_limits::lowest()) <= y and + std::nextafter(x, std::numeric_limits::max()) >= y; + } + + template + bool operator()(T x, U y) const + { + return float_equal_fn::apply>(x, y); + } +}; + +static constexpr float_equal_fn float_equal{}; + +/// Special case for comparing with a sentinel value +struct float_equal_sentinel_fn +{ + template + static bool apply(T x, T y) + { +// In this case we have to ignore this warning, because we intend to compare with the exact value +#pragma clang diagnostic push +#pragma clang diagnostic ignored "-Wfloat-equal" + bool equals_sentinel = x == y; +#pragma clang diagnostic pop + + return std::isfinite(static_cast(x)) and std::isfinite(static_cast(y)) and + equals_sentinel; + } + + template + bool operator()(T x, U y) const + { + return float_equal_sentinel_fn::apply>(x, y); + } +}; + +static constexpr float_equal_sentinel_fn float_equal_sentinel{}; + +} // namespace miopen + +#endif diff --git a/projects/miopen/common_utils/include/common_utils/ford.hpp b/projects/miopen/common_utils/include/common_utils/ford.hpp new file mode 100644 index 000000000000..4ff4ddfa32e2 --- /dev/null +++ b/projects/miopen/common_utils/include/common_utils/ford.hpp @@ -0,0 +1,122 @@ 
+/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2017 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + *******************************************************************************/ +#ifndef GUARD_FORD_HPP +#define GUARD_FORD_HPP + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include + +namespace miopen { + +// An improved async, that doesn't block +template +std::future::type> detach_async(Function&& f) +{ + using result_type = typename std::invoke_result::type; + std::packaged_task task(std::forward(f)); + auto fut = task.get_future(); + std::thread(std::move(task)).detach(); + return fut; +} + +template +auto then(std::future f, Work w) -> std::future +{ + return std::async(std::launch::deferred, + [=, f_ = std::move(f)]() mutable { return w(f_.get()); }); +} + +template +struct ford_wrapper +{ + template + auto operator()(Ts... xs) const MIOPEN_RETURNS(std::bind(T{}, std::placeholders::_1, xs...)); +}; + +// Multidimensional for loop +struct ford_impl +{ + template + void operator()(F f) const + { + f(); + } + + template + void operator()(F f, T x, Ts... xs) const + { + // Workaround for https://gcc.gnu.org/bugzilla/show_bug.cgi?id=55914 + for(T i = 0; i < x; i++) + { + (*this)([&](Ts... is) { f(i, is...); }, xs...); + } + } +}; + +static constexpr ford_wrapper ford{}; + +struct par_ford_impl +{ + template + void operator()(F f, Ts... 
xs) const + { + using array_type = std::array; + array_type lens = {{static_cast(xs)...}}; + array_type strides; + strides.fill(1); + std::partial_sum( + lens.rbegin(), lens.rend() - 1, strides.rbegin() + 1, std::multiplies()); + auto size = std::accumulate( + lens.begin(), lens.end(), static_cast(1), std::multiplies()); + par_for(size, [&](std::size_t i) { + array_type indices; + std::transform(strides.begin(), + strides.end(), + lens.begin(), + indices.begin(), + [&](size_t stride, size_t len) { return (i / stride) % len; }); + unpack(f, indices); + }); + } +}; + +static constexpr ford_wrapper par_ford{}; + +} // namespace miopen + +#endif diff --git a/projects/miopen/common_utils/include/common_utils/functional.hpp b/projects/miopen/common_utils/include/common_utils/functional.hpp new file mode 100644 index 000000000000..19dde2bd28dc --- /dev/null +++ b/projects/miopen/common_utils/include/common_utils/functional.hpp @@ -0,0 +1,131 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2017 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + *******************************************************************************/ +#ifndef GUARD_MLOPEN_FUNCTIONAL_HPP +#define GUARD_MLOPEN_FUNCTIONAL_HPP + +#include +#include +#include + +namespace miopen { +namespace detail { + +template +auto each_i_impl(F f, std::index_sequence) + MIOPEN_RETURNS(f(std::integral_constant{}...)); +} // namespace detail + +template +struct by_t +{ + F f; + P p; + template + auto operator()(Ts&&... xs) const MIOPEN_RETURNS(f(p(std::forward(xs))...)) +}; + +template +by_t by(F f, P p) +{ + return {std::move(f), std::move(p)}; +} + +template +struct compose_t +{ + F f; + G g; + template + auto operator()(Ts&&... xs) const MIOPEN_RETURNS(f(g(std::forward(xs)...))) +}; + +template +compose_t compose(F f, G g) +{ + return {std::move(f), std::move(g)}; +} + +template +struct flip_t +{ + F f; + template + auto operator()(T&& x, U&& y) const MIOPEN_RETURNS(f(std::forward(y), std::forward(x))) +}; + +template +flip_t flip(F f) +{ + return {std::move(f)}; +} + +template +struct sequence_t +{ + F f; + template + auto operator()(IntegralConstant) const + MIOPEN_RETURNS(detail::each_i_impl(f, std::make_index_sequence())); +}; + +template +sequence_t sequence(F f) +{ + return {std::move(f)}; +} + +template +void repeat_n(F f, std::integral_constant) +{ + auto fs = [&f](auto... is) { return each_args(f, is...); }; + sequence(fs)(std::integral_constant{}); +} + +template +struct cast_to +{ + template + T operator()(X&& x) const + { + return static_cast(std::forward(x)); + } +}; + +template +auto unpacker(F f) +{ + return [=](auto xs) { return miopen::unpack(f, xs); }; +}; + +template +auto prepender(F f, Xs... xs) +{ + return [=](auto... 
ys) { return f(xs..., ys...); }; +} + +} // namespace miopen + +#endif diff --git a/projects/miopen/common_utils/include/common_utils/par_for.hpp b/projects/miopen/common_utils/include/common_utils/par_for.hpp new file mode 100644 index 000000000000..1272dcf6ac9b --- /dev/null +++ b/projects/miopen/common_utils/include/common_utils/par_for.hpp @@ -0,0 +1,149 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2020 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + *******************************************************************************/ + +#ifndef MIOPEN_GUARD_MLOPEN_PAR_FOR_HPP +#define MIOPEN_GUARD_MLOPEN_PAR_FOR_HPP + +#include +#include +#include +#include +#include +#include + +#include + +namespace miopen { + +struct joinable_thread : std::thread +{ + template + joinable_thread(Xs&&... 
xs) : std::thread(std::forward(xs)...) // NOLINT + { + } + + joinable_thread& operator=(joinable_thread&& other) = default; + joinable_thread(joinable_thread&& other) = default; + + ~joinable_thread() + { + if(this->joinable()) + this->join(); + } +}; + +struct thread_factory +{ + template + joinable_thread operator()(std::size_t& work, std::size_t n, std::size_t grainsize, F f) const + { + auto result = joinable_thread([=] { + std::size_t start = work; + std::size_t last = std::min(n, work + grainsize); + for(std::size_t i = start; i < last; i++) + { + f(i); + } + }); + work += grainsize; + return result; + } +}; + +template +void par_for_impl(std::size_t n, std::size_t threadsize, F f) +{ + if(threadsize <= 1) + { + for(std::size_t i = 0; i < n; i++) + f(i); + } + else + { + std::vector threads(threadsize); + const std::size_t grainsize = std::ceil(static_cast(n) / threads.size()); + + std::size_t work = 0; + std::generate(threads.begin(), + threads.end(), + std::bind(thread_factory{}, std::ref(work), n, grainsize, f)); + assert(work >= n); + } +} + +template +void par_for(std::size_t n, std::size_t min_grain, F f) +{ + const auto threadsize = + std::min(std::thread::hardware_concurrency(), n / min_grain); + par_for_impl(n, threadsize, f); +} + +struct min_grain +{ + std::size_t n = 0; +}; + +template +void par_for(std::size_t n, min_grain mg, F f) +{ + const auto threadsize = std::min(std::thread::hardware_concurrency(), n / mg.n); + par_for_impl(n, threadsize, f); +} + +template +void par_for(std::size_t n, F f) +{ + par_for(n, min_grain{8}, f); +} + +struct max_threads +{ + std::size_t n = 0; +}; + +template +void par_for(std::size_t n, max_threads mt, F f) +{ + const auto threadsize = std::min(std::thread::hardware_concurrency(), mt.n); + par_for_impl(n, std::min(threadsize, n), f); +} + +template +void par_for_strided(std::size_t n, max_threads mt, F f) +{ + auto threadsize = std::min(std::thread::hardware_concurrency(), mt.n); + par_for_impl(threadsize, 
threadsize, [&](auto start) { + for(std::size_t i = start; i < n; i += threadsize) + { + f(i); + } + }); +} + +} // namespace miopen + +#endif diff --git a/projects/miopen/common_utils/include/common_utils/rank.hpp b/projects/miopen/common_utils/include/common_utils/rank.hpp new file mode 100644 index 000000000000..013ec6e7f7f4 --- /dev/null +++ b/projects/miopen/common_utils/include/common_utils/rank.hpp @@ -0,0 +1,42 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2017 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + *******************************************************************************/ +#ifndef GUARD_MIOPEN_RANK_HPP +#define GUARD_MIOPEN_RANK_HPP + +namespace miopen { + +template +struct rank : rank +{ +}; + +template <> +struct rank<0> +{ +}; +} // namespace miopen + +#endif diff --git a/projects/miopen/common_utils/include/common_utils/returns.hpp b/projects/miopen/common_utils/include/common_utils/returns.hpp new file mode 100644 index 000000000000..4fdb1db18b87 --- /dev/null +++ b/projects/miopen/common_utils/include/common_utils/returns.hpp @@ -0,0 +1,38 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2017 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + *******************************************************************************/ + +#ifndef GUARD_MIOPEN_RETURNS_HPP +#define GUARD_MIOPEN_RETURNS_HPP + +#define MIOPEN_RETURNS(...) \ + ->decltype(__VA_ARGS__) { return __VA_ARGS__; } + +#define MIOPEN_BODY_RETURNS(...) \ + { \ + return __VA_ARGS__; \ + } + +#endif diff --git a/projects/miopen/common_utils/include/common_utils/type_name.hpp b/projects/miopen/common_utils/include/common_utils/type_name.hpp new file mode 100644 index 000000000000..ac7fd2ff6017 --- /dev/null +++ b/projects/miopen/common_utils/include/common_utils/type_name.hpp @@ -0,0 +1,139 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2017-2024 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + *******************************************************************************/ + +#ifndef GUARD_TYPE_NAME_HPP +#define GUARD_TYPE_NAME_HPP + +#include +#include +#if defined(_MSC_VER) && !defined(__clang__) && !defined(__GNUC__) +#include +#endif + +namespace miopen { + +template +constexpr std::string_view type_name() +{ +#if defined(__clang__) || defined(__GNUC__) + // clang or gcc + constexpr auto full_name = std::string_view{__PRETTY_FUNCTION__}; +#elif defined(_MSC_VER) + // msvc + constexpr auto full_name = std::string_view{__FUNCSIG__}; +#endif + + // The substring with the data type name is located within the original string, between the + // prefix and the suffix, with the prefix always not at the beginning of the string and the + // suffix always at the end of the string. +#if defined(__clang__) + // clang + constexpr auto prefix = std::string_view{"[T = "}; + constexpr auto suffix = std::string_view{"]"}; +#elif defined(__GNUC__) + // gcc + constexpr auto prefix = std::string_view{"[with T = "}; + constexpr auto suffix = std::string_view{"; std::string_view = std::basic_string_view]"}; +#elif defined(_MSC_VER) + // msvc + constexpr auto prefix = std::string_view{"type_name<"}; + constexpr auto suffix = std::string_view{">(void)"}; +#endif + + constexpr auto prefix_pos = full_name.find(prefix); + static_assert(prefix_pos != std::string_view::npos); + + constexpr auto suffix_pos = full_name.rfind(suffix); + static_assert(suffix_pos != std::string_view::npos); + static_assert(suffix_pos == full_name.size() - suffix.size()); + + constexpr auto pos = prefix_pos + prefix.size(); + static_assert(pos < suffix_pos); + constexpr auto count = suffix_pos - pos; + + constexpr auto name = full_name.substr(pos, count); + +#if defined(__clang__) || defined(__GNUC__) + // clang or gcc + return name; +#elif defined(_MSC_VER) + // msvc + if constexpr(std::is_compound_v) + { + // For compound data types, the string contains the keyword 'class/struct/union/enum' 
before + // the data type name, separated by a space. + constexpr auto sep = std::string_view{" "}; + constexpr auto sep_pos = name.find(sep); + static_assert(sep_pos != std::string_view::npos); + static_assert(sep_pos != 0); // must not be at the 0 position + + constexpr auto name_pos = sep_pos + sep.size(); + constexpr auto tname = name.substr(name_pos); + static_assert(tname.size() > 0); + + return tname; + } + else + { + return name; + } +#endif +} + +template +constexpr std::string_view type_name_bare() +{ + constexpr auto name = type_name(); + constexpr auto pos = name.rfind(':'); + if constexpr(pos == std::string_view::npos) + { + constexpr auto result = name; + return result; + } + else + { + constexpr auto bare_name = name.substr(pos + 1); + static_assert(bare_name.size() > 0); + return bare_name; + } +} + +template +const std::string& get_type_name() +{ + static const auto ret = std::string(type_name()); + return ret; +} + +template +const std::string& get_type_name(const T&) +{ + return miopen::get_type_name(); +} + +} // namespace miopen + +#endif diff --git a/projects/miopen/driver/CMakeLists.txt b/projects/miopen/driver/CMakeLists.txt index 4aac2358c432..693a3d47d599 100644 --- a/projects/miopen/driver/CMakeLists.txt +++ b/projects/miopen/driver/CMakeLists.txt @@ -74,7 +74,7 @@ endif() add_dependencies(MIOpenDriver generate_kernels) target_include_directories(MIOpenDriver PRIVATE ../src/kernels) # MIOpen_with_plugins ensures CK plugin .so's are built alongside MIOpenDriver -target_link_libraries(MIOpenDriver PRIVATE MIOpen_with_plugins Threads::Threads roc::rocrand nlohmann_json::nlohmann_json ) +target_link_libraries(MIOpenDriver PRIVATE MIOpen_with_plugins Threads::Threads roc::rocrand nlohmann_json::nlohmann_json miopen_common_utils) if(NOT MIOPEN_EMBED_DB STREQUAL "") target_link_libraries(MIOpenDriver PRIVATE $ ) endif() diff --git a/projects/miopen/src/CMakeLists.txt b/projects/miopen/src/CMakeLists.txt index 9e6f401b7506..3ba48b6ca763 100644 
--- a/projects/miopen/src/CMakeLists.txt +++ b/projects/miopen/src/CMakeLists.txt @@ -931,7 +931,7 @@ endif() target_include_directories(MIOpen SYSTEM PUBLIC $) # Workaround : change in rocm-cmake was causing linking error so had to add ${CMAKE_DL_LIBS} # We can remove ${CMAKE_DL_LIBS} once root cause is identified. -target_link_libraries(MIOpen PRIVATE ${CMAKE_DL_LIBS} Threads::Threads BZip2::BZip2) +target_link_libraries(MIOpen PRIVATE ${CMAKE_DL_LIBS} Threads::Threads BZip2::BZip2 miopen_common_utils) miopen_generate_export_header(MIOpen) if(WIN32) diff --git a/projects/miopen/src/include/miopen/algorithm.hpp b/projects/miopen/src/include/miopen/algorithm.hpp index d1098a066077..91b0383b823b 100644 --- a/projects/miopen/src/include/miopen/algorithm.hpp +++ b/projects/miopen/src/include/miopen/algorithm.hpp @@ -23,25 +23,8 @@ * SOFTWARE. * *******************************************************************************/ +// Forwarding header -- implementation moved to common_utils. #ifndef GUARD_MLOPEN_ALGORITHM_HPP #define GUARD_MLOPEN_ALGORITHM_HPP - -#include - -namespace miopen { - -template -bool any_of(const Range& r, Predicate p) -{ - return std::any_of(r.begin(), r.end(), p); -} - -template -bool all_of(const Range& r, Predicate p) -{ - return std::all_of(r.begin(), r.end(), p); -} - -} // namespace miopen - +#include #endif diff --git a/projects/miopen/src/include/miopen/each_args.hpp b/projects/miopen/src/include/miopen/each_args.hpp index e078153dc998..646fd53d263f 100644 --- a/projects/miopen/src/include/miopen/each_args.hpp +++ b/projects/miopen/src/include/miopen/each_args.hpp @@ -23,57 +23,8 @@ * SOFTWARE. * *******************************************************************************/ +// Forwarding header -- implementation moved to common_utils. 
#ifndef GUARD_MIOPEN_EACH_ARGS_HPP #define GUARD_MIOPEN_EACH_ARGS_HPP - -#include -#include -#include - -namespace miopen { -namespace detail { - -template -void each_args_i_impl(F f, std::index_sequence, Ts&&... xs) -{ - (void)std::initializer_list{ - (f(std::integral_constant{}, std::forward(xs)), 0)...}; -} - -template -auto unpack_impl(F f, std::index_sequence, T&& x) -{ - return f(std::get(x)...); -} - -} // namespace detail - -template -void each_args_i(F f, Ts&&... xs) -{ - detail::each_args_i_impl(f, std::make_index_sequence(), std::forward(xs)...); -} - -template -void each_args(F f, Ts&&... xs) -{ - (void)std::initializer_list{(f(std::forward(xs)), 0)...}; -} - -// Workaround for gcc warnings -template -void each_args(F) -{ -} - -template -auto unpack(F f, T&& x) -{ - using type = typename std::remove_cv::type>::type; - return detail::unpack_impl( - f, std::make_index_sequence::value>(), std::forward(x)); -} - -} // namespace miopen - +#include #endif diff --git a/projects/miopen/src/include/miopen/float_equal.hpp b/projects/miopen/src/include/miopen/float_equal.hpp index 24bbdc55ad11..43bd3d7ab14a 100644 --- a/projects/miopen/src/include/miopen/float_equal.hpp +++ b/projects/miopen/src/include/miopen/float_equal.hpp @@ -23,67 +23,8 @@ * SOFTWARE. * *******************************************************************************/ +// Forwarding header -- implementation moved to common_utils. #ifndef GUARD_MLOPEN_FLOAT_EQUAL_HPP #define GUARD_MLOPEN_FLOAT_EQUAL_HPP - -#include -#include -#include -#include - -namespace miopen { - -template -using common_type = typename std::common_type::type; - -struct float_equal_fn -{ - template - static bool apply(T x, T y) - { - // The standard library from MSVC does not implement std::isfinite() for integer - // types - no additional overloads are provided. According to the documentation, - // integer types should be treaded as doubles. 
- // Refer to https://en.cppreference.com/w/cpp/numeric/math/isfinite for more information. - return std::isfinite(static_cast(x)) and std::isfinite(static_cast(y)) and - std::nextafter(x, std::numeric_limits::lowest()) <= y and - std::nextafter(x, std::numeric_limits::max()) >= y; - } - - template - bool operator()(T x, U y) const - { - return float_equal_fn::apply>(x, y); - } -}; - -static constexpr float_equal_fn float_equal{}; - -/// Special case for comparing with a sentinel value -struct float_equal_sentinel_fn -{ - template - static bool apply(T x, T y) - { -// In this case we have to ignore this warning, because we intend to compare with the exact value -#pragma clang diagnostic push -#pragma clang diagnostic ignored "-Wfloat-equal" - bool equals_sentinel = x == y; -#pragma clang diagnostic pop - - return std::isfinite(static_cast(x)) and std::isfinite(static_cast(y)) and - equals_sentinel; - } - - template - bool operator()(T x, U y) const - { - return float_equal_sentinel_fn::apply>(x, y); - } -}; - -static constexpr float_equal_sentinel_fn float_equal_sentinel{}; - -} // namespace miopen - +#include #endif diff --git a/projects/miopen/src/include/miopen/ford.hpp b/projects/miopen/src/include/miopen/ford.hpp index f56b20de4d46..0dc62c9ae495 100644 --- a/projects/miopen/src/include/miopen/ford.hpp +++ b/projects/miopen/src/include/miopen/ford.hpp @@ -1,122 +1,5 @@ -/******************************************************************************* - * - * MIT License - * - * Copyright (c) 2017 Advanced Micro Devices, Inc. 
- * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - * - *******************************************************************************/ +// Forwarding header -- implementation moved to common_utils. #ifndef GUARD_FORD_HPP #define GUARD_FORD_HPP - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include - -#include - -namespace miopen { - -// An improved async, that doesn't block -template -std::future::type> detach_async(Function&& f) -{ - using result_type = typename std::invoke_result::type; - std::packaged_task task(std::forward(f)); - auto fut = task.get_future(); - std::thread(std::move(task)).detach(); - return fut; -} - -template -auto then(std::future f, Work w) -> std::future -{ - return std::async(std::launch::deferred, - [=, f_ = std::move(f)]() mutable { return w(f_.get()); }); -} - -template -struct ford_wrapper -{ - template - auto operator()(Ts...
xs) const MIOPEN_RETURNS(std::bind(T{}, std::placeholders::_1, xs...)); -}; - -// Multidimensional for loop -struct ford_impl -{ - template - void operator()(F f) const - { - f(); - } - - template - void operator()(F f, T x, Ts... xs) const - { - // Workaround for https://gcc.gnu.org/bugzilla/show_bug.cgi?id=55914 - for(T i = 0; i < x; i++) - { - (*this)([&](Ts... is) { f(i, is...); }, xs...); - } - } -}; - -static constexpr ford_wrapper ford{}; - -struct par_ford_impl -{ - template - void operator()(F f, Ts... xs) const - { - using array_type = std::array; - array_type lens = {{static_cast(xs)...}}; - array_type strides; - strides.fill(1); - std::partial_sum( - lens.rbegin(), lens.rend() - 1, strides.rbegin() + 1, std::multiplies()); - auto size = std::accumulate( - lens.begin(), lens.end(), static_cast(1), std::multiplies()); - par_for(size, [&](std::size_t i) { - array_type indices; - std::transform(strides.begin(), - strides.end(), - lens.begin(), - indices.begin(), - [&](size_t stride, size_t len) { return (i / stride) % len; }); - unpack(f, indices); - }); - } -}; - -static constexpr ford_wrapper par_ford{}; - -} // namespace miopen - +#include #endif diff --git a/projects/miopen/src/include/miopen/functional.hpp b/projects/miopen/src/include/miopen/functional.hpp index 02c6e3427e87..d1f7cb973349 100644 --- a/projects/miopen/src/include/miopen/functional.hpp +++ b/projects/miopen/src/include/miopen/functional.hpp @@ -1,131 +1,5 @@ -/******************************************************************************* - * - * MIT License - * - * Copyright (c) 2017 Advanced Micro Devices, Inc. 
- * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - * - *******************************************************************************/ +// Forwarding header -- implementation moved to common_utils. #ifndef GUARD_MLOPEN_FUNCTIONAL_HPP #define GUARD_MLOPEN_FUNCTIONAL_HPP - -#include -#include -#include - -namespace miopen { -namespace detail { - -template -auto each_i_impl(F f, std::index_sequence) - MIOPEN_RETURNS(f(std::integral_constant{}...)); -} // namespace detail - -template -struct by_t -{ - F f; - P p; - template - auto operator()(Ts&&... xs) const MIOPEN_RETURNS(f(p(std::forward(xs))...)) -}; - -template -by_t by(F f, P p) -{ - return {std::move(f), std::move(p)}; -} - -template -struct compose_t -{ - F f; - G g; - template - auto operator()(Ts&&...
xs) const MIOPEN_RETURNS(f(g(std::forward(xs)...))) -}; - -template -compose_t compose(F f, G g) -{ - return {std::move(f), std::move(g)}; -} - -template -struct flip_t -{ - F f; - template - auto operator()(T&& x, U&& y) const MIOPEN_RETURNS(f(std::forward(y), std::forward(x))) -}; - -template -flip_t flip(F f) -{ - return {std::move(f)}; -} - -template -struct sequence_t -{ - F f; - template - auto operator()(IntegralConstant) const - MIOPEN_RETURNS(detail::each_i_impl(f, std::make_index_sequence())); -}; - -template -sequence_t sequence(F f) -{ - return {std::move(f)}; -} - -template -void repeat_n(F f, std::integral_constant) -{ - auto fs = [&f](auto... is) { return each_args(f, is...); }; - sequence(fs)(std::integral_constant{}); -} - -template -struct cast_to -{ - template - T operator()(X&& x) const - { - return static_cast(std::forward(x)); - } -}; - -template -auto unpacker(F f) -{ - return [=](auto xs) { return miopen::unpack(f, xs); }; -}; - -template -auto prepender(F f, Xs... xs) -{ - return [=](auto... ys) { return f(xs..., ys...); }; -} - -} // namespace miopen - +#include #endif diff --git a/projects/miopen/src/include/miopen/par_for.hpp b/projects/miopen/src/include/miopen/par_for.hpp index 1272dcf6ac9b..71a1125de408 100644 --- a/projects/miopen/src/include/miopen/par_for.hpp +++ b/projects/miopen/src/include/miopen/par_for.hpp @@ -23,127 +23,8 @@ * SOFTWARE. * *******************************************************************************/ - +// Forwarding header -- implementation moved to common_utils. #ifndef MIOPEN_GUARD_MLOPEN_PAR_FOR_HPP #define MIOPEN_GUARD_MLOPEN_PAR_FOR_HPP - -#include -#include -#include -#include -#include -#include - -#include - -namespace miopen { - -struct joinable_thread : std::thread -{ - template - joinable_thread(Xs&&... xs) : std::thread(std::forward(xs)...) 
// NOLINT - { - } - - joinable_thread& operator=(joinable_thread&& other) = default; - joinable_thread(joinable_thread&& other) = default; - - ~joinable_thread() - { - if(this->joinable()) - this->join(); - } -}; - -struct thread_factory -{ - template - joinable_thread operator()(std::size_t& work, std::size_t n, std::size_t grainsize, F f) const - { - auto result = joinable_thread([=] { - std::size_t start = work; - std::size_t last = std::min(n, work + grainsize); - for(std::size_t i = start; i < last; i++) - { - f(i); - } - }); - work += grainsize; - return result; - } -}; - -template -void par_for_impl(std::size_t n, std::size_t threadsize, F f) -{ - if(threadsize <= 1) - { - for(std::size_t i = 0; i < n; i++) - f(i); - } - else - { - std::vector threads(threadsize); - const std::size_t grainsize = std::ceil(static_cast(n) / threads.size()); - - std::size_t work = 0; - std::generate(threads.begin(), - threads.end(), - std::bind(thread_factory{}, std::ref(work), n, grainsize, f)); - assert(work >= n); - } -} - -template -void par_for(std::size_t n, std::size_t min_grain, F f) -{ - const auto threadsize = - std::min(std::thread::hardware_concurrency(), n / min_grain); - par_for_impl(n, threadsize, f); -} - -struct min_grain -{ - std::size_t n = 0; -}; - -template -void par_for(std::size_t n, min_grain mg, F f) -{ - const auto threadsize = std::min(std::thread::hardware_concurrency(), n / mg.n); - par_for_impl(n, threadsize, f); -} - -template -void par_for(std::size_t n, F f) -{ - par_for(n, min_grain{8}, f); -} - -struct max_threads -{ - std::size_t n = 0; -}; - -template -void par_for(std::size_t n, max_threads mt, F f) -{ - const auto threadsize = std::min(std::thread::hardware_concurrency(), mt.n); - par_for_impl(n, std::min(threadsize, n), f); -} - -template -void par_for_strided(std::size_t n, max_threads mt, F f) -{ - auto threadsize = std::min(std::thread::hardware_concurrency(), mt.n); - par_for_impl(threadsize, threadsize, [&](auto start) { - 
for(std::size_t i = start; i < n; i += threadsize) - { - f(i); - } - }); -} - -} // namespace miopen - +#include #endif diff --git a/projects/miopen/src/include/miopen/rank.hpp b/projects/miopen/src/include/miopen/rank.hpp index 013ec6e7f7f4..1756782673ad 100644 --- a/projects/miopen/src/include/miopen/rank.hpp +++ b/projects/miopen/src/include/miopen/rank.hpp @@ -23,20 +23,8 @@ * SOFTWARE. * *******************************************************************************/ +// Forwarding header -- implementation moved to common_utils. #ifndef GUARD_MIOPEN_RANK_HPP #define GUARD_MIOPEN_RANK_HPP - -namespace miopen { - -template -struct rank : rank -{ -}; - -template <> -struct rank<0> -{ -}; -} // namespace miopen - +#include #endif diff --git a/projects/miopen/src/include/miopen/returns.hpp b/projects/miopen/src/include/miopen/returns.hpp index 4fdb1db18b87..dd0873cfb2b3 100644 --- a/projects/miopen/src/include/miopen/returns.hpp +++ b/projects/miopen/src/include/miopen/returns.hpp @@ -23,16 +23,8 @@ * SOFTWARE. * *******************************************************************************/ - +// Forwarding header -- implementation moved to common_utils. #ifndef GUARD_MIOPEN_RETURNS_HPP #define GUARD_MIOPEN_RETURNS_HPP - -#define MIOPEN_RETURNS(...) \ - ->decltype(__VA_ARGS__) { return __VA_ARGS__; } - -#define MIOPEN_BODY_RETURNS(...) \ - { \ - return __VA_ARGS__; \ - } - +#include #endif diff --git a/projects/miopen/src/include/miopen/type_name.hpp b/projects/miopen/src/include/miopen/type_name.hpp index ac7fd2ff6017..d2cce63d3d32 100644 --- a/projects/miopen/src/include/miopen/type_name.hpp +++ b/projects/miopen/src/include/miopen/type_name.hpp @@ -23,117 +23,8 @@ * SOFTWARE. * *******************************************************************************/ - +// Forwarding header -- implementation moved to common_utils. 
#ifndef GUARD_TYPE_NAME_HPP #define GUARD_TYPE_NAME_HPP - -#include -#include -#if defined(_MSC_VER) && !defined(__clang__) && !defined(__GNUC__) -#include -#endif - -namespace miopen { - -template -constexpr std::string_view type_name() -{ -#if defined(__clang__) || defined(__GNUC__) - // clang or gcc - constexpr auto full_name = std::string_view{__PRETTY_FUNCTION__}; -#elif defined(_MSC_VER) - // msvc - constexpr auto full_name = std::string_view{__FUNCSIG__}; -#endif - - // The substring with the data type name is located within the original string, between the - // prefix and the suffix, with the prefix always not at the beginning of the string and the - // suffix always at the end of the string. -#if defined(__clang__) - // clang - constexpr auto prefix = std::string_view{"[T = "}; - constexpr auto suffix = std::string_view{"]"}; -#elif defined(__GNUC__) - // gcc - constexpr auto prefix = std::string_view{"[with T = "}; - constexpr auto suffix = std::string_view{"; std::string_view = std::basic_string_view]"}; -#elif defined(_MSC_VER) - // msvc - constexpr auto prefix = std::string_view{"type_name<"}; - constexpr auto suffix = std::string_view{">(void)"}; -#endif - - constexpr auto prefix_pos = full_name.find(prefix); - static_assert(prefix_pos != std::string_view::npos); - - constexpr auto suffix_pos = full_name.rfind(suffix); - static_assert(suffix_pos != std::string_view::npos); - static_assert(suffix_pos == full_name.size() - suffix.size()); - - constexpr auto pos = prefix_pos + prefix.size(); - static_assert(pos < suffix_pos); - constexpr auto count = suffix_pos - pos; - - constexpr auto name = full_name.substr(pos, count); - -#if defined(__clang__) || defined(__GNUC__) - // clang or gcc - return name; -#elif defined(_MSC_VER) - // msvc - if constexpr(std::is_compound_v) - { - // For compound data types, the string contains the keyword 'class/struct/union/enum' before - // the data type name, separated by a space. 
- constexpr auto sep = std::string_view{" "}; - constexpr auto sep_pos = name.find(sep); - static_assert(sep_pos != std::string_view::npos); - static_assert(sep_pos != 0); // must not be at the 0 position - - constexpr auto name_pos = sep_pos + sep.size(); - constexpr auto tname = name.substr(name_pos); - static_assert(tname.size() > 0); - - return tname; - } - else - { - return name; - } -#endif -} - -template -constexpr std::string_view type_name_bare() -{ - constexpr auto name = type_name(); - constexpr auto pos = name.rfind(':'); - if constexpr(pos == std::string_view::npos) - { - constexpr auto result = name; - return result; - } - else - { - constexpr auto bare_name = name.substr(pos + 1); - static_assert(bare_name.size() > 0); - return bare_name; - } -} - -template -const std::string& get_type_name() -{ - static const auto ret = std::string(type_name()); - return ret; -} - -template -const std::string& get_type_name(const T&) -{ - return miopen::get_type_name(); -} - -} // namespace miopen - +#include #endif diff --git a/projects/miopen/test/CMakeLists.txt b/projects/miopen/test/CMakeLists.txt index 57601d45ceaf..bef91d0ea871 100755 --- a/projects/miopen/test/CMakeLists.txt +++ b/projects/miopen/test/CMakeLists.txt @@ -414,9 +414,9 @@ function(add_test_executable TEST_NAME) endif() # MIOpen_with_plugins ensures CK plugin .so's are built alongside the test if(NOT MIOPEN_EMBED_DB STREQUAL "") - target_link_libraries(${TEST_NAME} MIOpen_with_plugins miopen_data) + target_link_libraries(${TEST_NAME} MIOpen_with_plugins miopen_data miopen_common_utils) else() - target_link_libraries(${TEST_NAME} MIOpen_with_plugins) + target_link_libraries(${TEST_NAME} MIOpen_with_plugins miopen_common_utils) endif() target_include_directories(${TEST_NAME} PRIVATE ../src/kernels) if(WIN32) From 69c619ea64d55fc0660260b8755ae97b7d305f81 Mon Sep 17 00:00:00 2001 From: Brad Pepers Date: Sat, 9 May 2026 07:14:21 -0600 Subject: [PATCH 02/11] Move bfloat16, stringutils, reduce_common, 
random to common_utils Continue populating the common_utils shared utility library: - bfloat16.hpp: Removed miopen/config.h dependency, MIOPEN_USE_RNE_BFLOAT16 now provided via CMake compile definition on the INTERFACE target - stringutils.hpp: Replaced miopen/errors.hpp dependency with std::runtime_error, updated algorithm include to common_utils path - reduce_common.hpp: Updated bfloat16 include to common_utils path - random.hpp: Moved from driver/ to common_utils/ to break the circular dependency between driver/ and test/. Note: still depends on miopen/env.hpp (to be cleaned up in Phase 2) Forwarding headers left at all original locations. Co-Authored-By: Claude Sonnet 4 --- projects/miopen/common_utils/CMakeLists.txt | 9 + .../include/common_utils/bfloat16.hpp | 179 ++++++++++++++++++ .../include/common_utils/random.hpp | 159 ++++++++++++++++ .../include/common_utils/reduce_common.hpp | 66 +++++++ .../include/common_utils/stringutils.hpp | 165 ++++++++++++++++ projects/miopen/driver/random.hpp | 160 +--------------- .../miopen/src/include/miopen/bfloat16.hpp | 178 +---------------- .../src/include/miopen/reduce_common.hpp | 65 +------ .../miopen/src/include/miopen/stringutils.hpp | 166 +--------------- 9 files changed, 588 insertions(+), 559 deletions(-) create mode 100644 projects/miopen/common_utils/include/common_utils/bfloat16.hpp create mode 100644 projects/miopen/common_utils/include/common_utils/random.hpp create mode 100644 projects/miopen/common_utils/include/common_utils/reduce_common.hpp create mode 100644 projects/miopen/common_utils/include/common_utils/stringutils.hpp diff --git a/projects/miopen/common_utils/CMakeLists.txt b/projects/miopen/common_utils/CMakeLists.txt index c0f4620a3439..1afb185255c9 100644 --- a/projects/miopen/common_utils/CMakeLists.txt +++ b/projects/miopen/common_utils/CMakeLists.txt @@ -33,3 +33,12 @@ add_library(MIOpen::common_utils ALIAS miopen_common_utils) target_include_directories(miopen_common_utils INTERFACE $ ) + 
+ +# bfloat16.hpp needs to know the rounding mode. +# This option is also defined in src/CMakeLists.txt for backward compatibility. +option(MIOPEN_USE_RNE_BFLOAT16 "Sets rounding scheme for bfloat16 type" ON) +if(MIOPEN_USE_RNE_BFLOAT16) + target_compile_definitions(miopen_common_utils INTERFACE MIOPEN_USE_RNE_BFLOAT16=1) +else() + target_compile_definitions(miopen_common_utils INTERFACE MIOPEN_USE_RNE_BFLOAT16=0) +endif() diff --git a/projects/miopen/common_utils/include/common_utils/bfloat16.hpp b/projects/miopen/common_utils/include/common_utils/bfloat16.hpp new file mode 100644 index 000000000000..71fe70bbd3c7 --- /dev/null +++ b/projects/miopen/common_utils/include/common_utils/bfloat16.hpp @@ -0,0 +1,179 @@ +// Copyright (c) Advanced Micro Devices, Inc., or its affiliates. +// SPDX-License-Identifier: MIT + +#ifndef BFLOAT16_H_ +#define BFLOAT16_H_ + +#include +// MIOPEN_USE_RNE_BFLOAT16 is provided via CMake compile definitions. + +class bfloat16 +{ +public: + bfloat16() : data_{0} {} + explicit bfloat16(float rhs) + { + union + { + float float_st; + std::uint32_t bf16_st; + } bits_st = {rhs}; + + // BF16 round and NaN preservation code matches + // https://github.com/ROCm/rocBLAS/blob/develop/library/include/rocblas_bfloat16.h + if((~bits_st.bf16_st & 0x7f800000) == 0) // Inf or NaN + { + // When all of the exponent bits are 1, the value is Inf or NaN. + // Inf is indicated by a zero mantissa. NaN is indicated by any nonzero + // mantissa bit. Quiet NaN is indicated by the most significant mantissa + // bit being 1. Signaling NaN is indicated by the most significant + // mantissa bit being 0 but some other bit(s) being 1. If any of the + // lower 16 bits of the mantissa are 1, we set the least significant bit + // of the bfloat16 mantissa, in order to preserve signaling NaN in case + // the bfloat16's mantissa bits are all 0.
+ if((bits_st.bf16_st & 0xffff) != 0) + { + bits_st.bf16_st |= 0x10000; // Preserve signaling NaN + } + } + else + { +#if MIOPEN_USE_RNE_BFLOAT16 == 1 + // When the exponent bits are not all 1s, then the value is zero, normal, + // or subnormal. We round the bfloat16 mantissa up by adding 0x7FFF, plus + // 1 if the least significant bit of the bfloat16 mantissa is 1 (odd). + // This causes the bfloat16's mantissa to be incremented by 1 if the 16 + // least significant bits of the float mantissa are greater than 0x8000, + // or if they are equal to 0x8000 and the least significant bit of the + // bfloat16 mantissa is 1 (odd). This causes it to be rounded to even when + // the lower 16 bits are exactly 0x8000. If the bfloat16 mantissa already + // has the value 0x7f, then incrementing it causes it to become 0x00 and + // the exponent is incremented by one, which is the next higher FP value + // to the unrounded bfloat16 value. When the bfloat16 value is subnormal + // with an exponent of 0x00 and a mantissa of 0x7F, it may be rounded up + // to a normal value with an exponent of 0x01 and a mantissa of 0x00. + // When the bfloat16 value has an exponent of 0xFE and a mantissa of 0x7F, + // incrementing it causes it to become an exponent of 0xFF and a mantissa + // of 0x00, which is Inf, the next higher value to the unrounded value. 
+ bits_st.bf16_st += + (0x7fff + ((bits_st.bf16_st >> 16) & 1)); // Round to nearest, round to even +#else // truncation +// do nothing +#endif + } + data_ = bits_st.bf16_st >> 16; + } + operator float() const + { + union + { + std::uint32_t bf16_st; + float float_st; + } bits_st = {data_}; + + bits_st.bf16_st = bits_st.bf16_st << 16; + return bits_st.float_st; + } + + bfloat16 operator-() const { return bfloat16(-static_cast(*this)); } + bfloat16 operator+() const { return *this; } + + bfloat16& operator=(const float rhs) + { + *this = bfloat16(rhs); + return *this; + } + bfloat16& operator+=(bfloat16 rhs) + { + *this = bfloat16(static_cast(*this) + static_cast(rhs)); + return *this; + } + + bfloat16& operator+=(float rhs) + { + *this = bfloat16(static_cast(*this) + rhs); + return *this; + } + + bfloat16& operator-=(bfloat16 rhs) + { + *this += -rhs; + return *this; + } + bfloat16& operator*=(bfloat16 rhs) + { + *this = bfloat16(static_cast(*this) * static_cast(rhs)); + return *this; + } + bfloat16& operator*=(float rhs) + { + *this = bfloat16(static_cast(*this) * rhs); + return *this; + } + + bfloat16& operator/=(bfloat16 rhs) + { + *this = bfloat16(static_cast(*this) / static_cast(rhs)); + return *this; + } + bool operator<(bfloat16 rhs) const + { + return static_cast(*this) < static_cast(rhs); + } + bool operator==(bfloat16 rhs) const { return std::equal_to()(*this, rhs); } + + static constexpr bfloat16 generate(uint16_t val) { return bfloat16{val, true}; } + +private: + constexpr bfloat16(std::uint16_t val, bool) : data_{val} {} + + std::uint16_t data_; +}; + +inline bfloat16 operator+(bfloat16 a, const bfloat16& b) +{ + a += b; + return a; +} + +inline bfloat16 operator-(bfloat16 a, const bfloat16& b) +{ + a -= b; + return a; +} + +inline bfloat16 operator*(bfloat16 a, const bfloat16& b) +{ + a *= b; + return a; +} + +inline bfloat16 operator/(bfloat16 a, const bfloat16& b) +{ + a /= b; + return a; +} + +namespace std { +template <> +class numeric_limits +{ 
+public: + static constexpr bool is_specialized = true; + static constexpr bfloat16 min() noexcept { return bfloat16::generate(0x0080); } // 0x1.00p-126 + static constexpr bfloat16 max() noexcept { return bfloat16::generate(0x7F7F); } + static constexpr bfloat16 lowest() noexcept { return bfloat16::generate(0xFF7F); } + static constexpr bfloat16 epsilon() noexcept { return bfloat16::generate(0x3C00); } + static constexpr bfloat16 infinity() noexcept { return bfloat16::generate(0x7F80); } + static constexpr bfloat16 quiet_NaN() noexcept { return bfloat16::generate(0x7FC0); } // qnan(0) + static constexpr bfloat16 signaling_NaN() noexcept + { + return bfloat16::generate(0x7F81); // snan(1) + } + static constexpr bfloat16 denorm_min() noexcept + { + return bfloat16::generate(0x0001); // 0x0.02p-126 + } +}; +} // namespace std +#endif diff --git a/projects/miopen/common_utils/include/common_utils/random.hpp b/projects/miopen/common_utils/include/common_utils/random.hpp new file mode 100644 index 000000000000..f6f8d85c4ce4 --- /dev/null +++ b/projects/miopen/common_utils/include/common_utils/random.hpp @@ -0,0 +1,159 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2025 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + *******************************************************************************/ +#ifndef GUARD_RANDOM_GEN_ +#define GUARD_RANDOM_GEN_ + +#include + +#include +#include +#include + +MIOPEN_DECLARE_ENV_VAR_UINT64(MIOPEN_DEBUG_DRIVER_PRNG_SEED, 12345678) + +namespace env = miopen::env; + +namespace prng { +namespace details { +using glibc_gen = std::linear_congruential_engine; + +inline std::random_device::result_type get_default_seed() +{ + static std::random_device::result_type seed{[] { + auto external_seed = env::value(MIOPEN_DEBUG_DRIVER_PRNG_SEED); + + auto seed_ = external_seed == 0 + ? 
std::random_device{}() + : static_cast(external_seed); + std::cout << "PRNG seed: " << seed_ << "\n"; + return seed_; + }()}; + return seed; +} + +inline glibc_gen& get_prng() +{ + static thread_local glibc_gen gen{get_default_seed()}; + return gen; +} + +template +struct has_digits : std::false_type +{ +}; + +template +struct has_digits::digits)>> : std::true_type +{ +}; + +} // namespace details + +inline void reset_seed(std::random_device::result_type seed = 0) +{ + details::get_prng().seed(seed + details::get_default_seed()); +} + +// similar to std::generate_canonical, but simpler and faster +template +inline T gen_canonical() +{ + if constexpr(std::is_floating_point_v) // native fp + { + static constexpr T range = + static_cast(1) / + static_cast(details::glibc_gen::max() - details::glibc_gen::min() + 1); + return range * static_cast(details::get_prng()() - details::glibc_gen::min()); + } + else if constexpr(std::is_integral_v) + { + auto val = details::get_prng()(); + return static_cast(((val >> 4) + (val >> 16)) & 0x1); + } + else + { + return static_cast(gen_canonical()); + } +} + +template +inline T gen_0_to_B(T B) +{ + if constexpr(std::is_floating_point_v) // native fp + { + return gen_canonical() * B; + } + else if constexpr(std::is_integral_v) + { + // can only generate 27bit range, so it may not be suitable + // for huge 64 bit ranges, but we do not expect such ranges + return static_cast((details::get_prng()() >> 4) % B); + } + else // half/bfloat/etc + { + return static_cast(gen_0_to_B(static_cast(B))); + } +} + +template +inline T gen_A_to_B(T A, T B) +{ + assert(B > A); + return gen_0_to_B(B - A) + A; +} + +template +inline T gen_off_range(T offset, T range) +{ + static_assert(std::is_integral_v); + return prng::gen_0_to_B(range) + offset; +} + +template +inline T gen_subnorm() +{ + T denorm_val = static_cast(0); + if constexpr(!std::is_integral_v && !std::is_same_v && + std::is_trivially_copyable::value && details::has_digits::value) + { + using 
BitType = std::conditional_t>; + static_assert(sizeof(T) == sizeof(BitType)); + + // -1 because ::digits counts the first implicit digit + static constexpr auto mantissa_bits = std::numeric_limits::digits - 1; + + BitType denorm_bits = static_cast(gen_0_to_B(1 << mantissa_bits)); + denorm_bits |= Signed ? (gen_canonical() << (sizeof(T) * 8 - 1)) : 0; + + // the proper way to do a type punning + std::memcpy(&denorm_val, &denorm_bits, sizeof(T)); + } + return denorm_val; +} +} // namespace prng +#endif // GUARD_RANDOM_GEN_ diff --git a/projects/miopen/common_utils/include/common_utils/reduce_common.hpp b/projects/miopen/common_utils/include/common_utils/reduce_common.hpp new file mode 100644 index 000000000000..74ce541f694b --- /dev/null +++ b/projects/miopen/common_utils/include/common_utils/reduce_common.hpp @@ -0,0 +1,66 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2020 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + *******************************************************************************/ +#ifndef GUARD_MIOPEN_REDUCE_COMMON_HPP +#define GUARD_MIOPEN_REDUCE_COMMON_HPP + +#include +#include + +namespace reduce { + +template +static inline Tdst convert_type(Tsrc x) +{ + return static_cast(x); +} + +template <> +inline float convert_type(half_float::half x) +{ + return half_float::half_cast(x); +}; + +template <> +inline half_float::half convert_type(float x) +{ + return half_float::half_cast(x); +}; + +template <> +inline float convert_type(bfloat16 x) +{ + return float(x); +}; + +template <> +inline bfloat16 convert_type(float x) +{ + return bfloat16(x); +}; + +}; // end of namespace reduce + +#endif diff --git a/projects/miopen/common_utils/include/common_utils/stringutils.hpp b/projects/miopen/common_utils/include/common_utils/stringutils.hpp new file mode 100644 index 000000000000..19d579014c73 --- /dev/null +++ b/projects/miopen/common_utils/include/common_utils/stringutils.hpp @@ -0,0 +1,165 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2017 Advanced Micro Devices, Inc. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + *******************************************************************************/ +#ifndef GUARD_MIOPEN_STRINGUTILS_HPP +#define GUARD_MIOPEN_STRINGUTILS_HPP + +#include +#include +#include +#include +#include +#include +#include +#include + +#define MIOPEN_STRINGIZE_1(...) #__VA_ARGS__ +#define MIOPEN_STRINGIZE(...) 
MIOPEN_STRINGIZE_1(__VA_ARGS__) + +namespace miopen { + +inline std::string +ReplaceString(const std::string& in, const std::string& search, const std::string& replace) +{ + size_t pos = 0; + std::string subject(in); + while((pos = subject.find(search, pos)) != std::string::npos) + { + subject.replace(pos, search.length(), replace); + pos += replace.length(); + } + return subject; +} + +inline bool EndsWith(const std::string& value, const std::string& suffix) +{ + if(suffix.size() > value.size()) + return false; + else + return std::equal(suffix.rbegin(), suffix.rend(), value.rbegin()); +} + +template +inline std::string JoinStrings(Strings strings, std::string delim) +{ + auto it = strings.begin(); + if(it == strings.end()) + return ""; + + auto nit = std::next(it); + return std::accumulate( + nit, strings.end(), *it, [&](std::string x, std::string y) { return x + delim + y; }); +} + +template +static inline std::string TransformString(std::string s, F f) +{ + std::transform(s.begin(), s.end(), s.begin(), f); + return s; +} + +inline std::string ToUpper(std::string s) { return TransformString(std::move(s), ::toupper); } + +inline bool StartsWith(const std::string& value, const std::string& prefix) +{ + if(prefix.size() > value.size()) + return false; + else + return std::equal(prefix.begin(), prefix.end(), value.begin()); +} + +inline std::string RemovePrefix(std::string s, std::string prefix) +{ + if(StartsWith(s, prefix)) + return s.substr(prefix.length()); + else + return s; +} + +inline std::vector SplitSpaceSeparated(const std::string& in) +{ + std::istringstream ss(in); + const std::istream_iterator begin(ss), end; + return {begin, end}; +} + +inline std::vector SplitSpaceSeparated(const std::vector& in) +{ + std::vector rv; + for(const auto& item : in) + { + if(item.find(' ') != std::string::npos) + { + const auto splitted = SplitSpaceSeparated(item); + std::copy(splitted.begin(), splitted.end(), std::back_inserter(rv)); + } + else + { + 
rv.emplace_back(item); + } + } + return rv; +} + +inline std::vector SplitSpaceSeparated(const std::string& in, + const std::vector& dontSplitAfter) +{ + std::vector rv; + std::istringstream ss(in); + std::string s; + while(ss >> s) + { + if(any_of(dontSplitAfter, [&](const auto& dont) { return dont == s; })) + { + std::string s2; + if(ss >> s2) + { + s += std::string(" ").append(s2); // Exactly one space is important. + rv.push_back(s); + continue; + } + throw std::runtime_error("Error parsing string: '" + in + '\''); + } + rv.push_back(s); + } + return rv; +} + +inline std::vector SplitDelim(const std::string& in, const char delim) +{ + std::vector rv; + std::string token; + std::istringstream ss(in); + + while(std::getline(ss, token, delim)) + { + rv.push_back(token); + } + return rv; +} + +} // namespace miopen + +#endif // GUARD_MIOPEN_STRINGUTILS_HPP diff --git a/projects/miopen/driver/random.hpp b/projects/miopen/driver/random.hpp index f6f8d85c4ce4..81e630411c67 100644 --- a/projects/miopen/driver/random.hpp +++ b/projects/miopen/driver/random.hpp @@ -1,159 +1,5 @@ -/******************************************************************************* - * - * MIT License - * - * Copyright (c) 2025 Advanced Micro Devices, Inc. - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. 
- * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - * - *******************************************************************************/ +// Forwarding header — implementation moved to common_utils. #ifndef GUARD_RANDOM_GEN_ #define GUARD_RANDOM_GEN_ - -#include - -#include -#include -#include - -MIOPEN_DECLARE_ENV_VAR_UINT64(MIOPEN_DEBUG_DRIVER_PRNG_SEED, 12345678) - -namespace env = miopen::env; - -namespace prng { -namespace details { -using glibc_gen = std::linear_congruential_engine; - -inline std::random_device::result_type get_default_seed() -{ - static std::random_device::result_type seed{[] { - auto external_seed = env::value(MIOPEN_DEBUG_DRIVER_PRNG_SEED); - - auto seed_ = external_seed == 0 - ? 
std::random_device{}() - : static_cast(external_seed); - std::cout << "PRNG seed: " << seed_ << "\n"; - return seed_; - }()}; - return seed; -} - -inline glibc_gen& get_prng() -{ - static thread_local glibc_gen gen{get_default_seed()}; - return gen; -} - -template -struct has_digits : std::false_type -{ -}; - -template -struct has_digits::digits)>> : std::true_type -{ -}; - -} // namespace details - -inline void reset_seed(std::random_device::result_type seed = 0) -{ - details::get_prng().seed(seed + details::get_default_seed()); -} - -// similar to std::generate_canonical, but simpler and faster -template -inline T gen_canonical() -{ - if constexpr(std::is_floating_point_v) // native fp - { - static constexpr T range = - static_cast(1) / - static_cast(details::glibc_gen::max() - details::glibc_gen::min() + 1); - return range * static_cast(details::get_prng()() - details::glibc_gen::min()); - } - else if constexpr(std::is_integral_v) - { - auto val = details::get_prng()(); - return static_cast(((val >> 4) + (val >> 16)) & 0x1); - } - else - { - return static_cast(gen_canonical()); - } -} - -template -inline T gen_0_to_B(T B) -{ - if constexpr(std::is_floating_point_v) // native fp - { - return gen_canonical() * B; - } - else if constexpr(std::is_integral_v) - { - // can only generate 27bit range, so it may not be suitable - // for huge 64 bit ranges, but we do not expect such ranges - return static_cast((details::get_prng()() >> 4) % B); - } - else // half/bfloat/etc - { - return static_cast(gen_0_to_B(static_cast(B))); - } -} - -template -inline T gen_A_to_B(T A, T B) -{ - assert(B > A); - return gen_0_to_B(B - A) + A; -} - -template -inline T gen_off_range(T offset, T range) -{ - static_assert(std::is_integral_v); - return prng::gen_0_to_B(range) + offset; -} - -template -inline T gen_subnorm() -{ - T denorm_val = static_cast(0); - if constexpr(!std::is_integral_v && !std::is_same_v && - std::is_trivially_copyable::value && details::has_digits::value) - { - using 
BitType = std::conditional_t>; - static_assert(sizeof(T) == sizeof(BitType)); - - // -1 because ::digits counts the first implicit digit - static constexpr auto mantissa_bits = std::numeric_limits::digits - 1; - - BitType denorm_bits = static_cast(gen_0_to_B(1 << mantissa_bits)); - denorm_bits |= Signed ? (gen_canonical() << (sizeof(T) * 8 - 1)) : 0; - - // the proper way to do a type punning - std::memcpy(&denorm_val, &denorm_bits, sizeof(T)); - } - return denorm_val; -} -} // namespace prng -#endif // GUARD_RANDOM_GEN_ +#include +#endif diff --git a/projects/miopen/src/include/miopen/bfloat16.hpp b/projects/miopen/src/include/miopen/bfloat16.hpp index 3e3a184a72d1..fc3880629c68 100644 --- a/projects/miopen/src/include/miopen/bfloat16.hpp +++ b/projects/miopen/src/include/miopen/bfloat16.hpp @@ -1,179 +1,5 @@ -// Copyright (c) Advanced Micro Devices, Inc., or its affiliates. -// SPDX-License-Identifier: MIT - +// Forwarding header — implementation moved to common_utils. #ifndef BFLOAT16_H_ #define BFLOAT16_H_ - -#include -#include - -class bfloat16 -{ -public: - bfloat16() : data_{0} {} - explicit bfloat16(float rhs) - { - union - { - float float_st; - std::uint32_t bf16_st; - } bits_st = {rhs}; - - // BF16 round and NaN preservation code matches - // https://github.com/ROCm/rocBLAS/blob/develop/library/include/rocblas_bfloat16.h - if((~bits_st.bf16_st & 0x7f800000) == 0) // Inf or NaN - { - // When all of the exponent bits are 1, the value is Inf or NaN. - // Inf is indicated by a zero mantissa. NaN is indicated by any nonzero - // mantissa bit. Quiet NaN is indicated by the most significant mantissa - // bit being 1. Signaling NaN is indicated by the most significant - // mantissa bit being 0 but some other bit(s) being 1. If any of the - // lower 16 bits of the mantissa are 1, we set the least significant bit - // of the bfloat16 mantissa, in order to preserve signaling NaN in case - // the bloat16's mantissa bits are all 0. 
- if((bits_st.bf16_st & 0xffff) != 0) - { - bits_st.bf16_st |= 0x10000; // Preserve signaling NaN - } - } - else - { -#if MIOPEN_USE_RNE_BFLOAT16 == 1 - // When the exponent bits are not all 1s, then the value is zero, normal, - // or subnormal. We round the bfloat16 mantissa up by adding 0x7FFF, plus - // 1 if the least significant bit of the bfloat16 mantissa is 1 (odd). - // This causes the bfloat16's mantissa to be incremented by 1 if the 16 - // least significant bits of the float mantissa are greater than 0x8000, - // or if they are equal to 0x8000 and the least significant bit of the - // bfloat16 mantissa is 1 (odd). This causes it to be rounded to even when - // the lower 16 bits are exactly 0x8000. If the bfloat16 mantissa already - // has the value 0x7f, then incrementing it causes it to become 0x00 and - // the exponent is incremented by one, which is the next higher FP value - // to the unrounded bfloat16 value. When the bfloat16 value is subnormal - // with an exponent of 0x00 and a mantissa of 0x7F, it may be rounded up - // to a normal value with an exponent of 0x01 and a mantissa of 0x00. - // When the bfloat16 value has an exponent of 0xFE and a mantissa of 0x7F, - // incrementing it causes it to become an exponent of 0xFF and a mantissa - // of 0x00, which is Inf, the next higher value to the unrounded value. 
- bits_st.bf16_st += - (0x7fff + ((bits_st.bf16_st >> 16) & 1)); // Round to nearest, round to even -#else // truncation -// do nothing -#endif - } - data_ = bits_st.bf16_st >> 16; - } - operator float() const - { - union - { - std::uint32_t bf16_st; - float float_st; - } bits_st = {data_}; - - bits_st.bf16_st = bits_st.bf16_st << 16; - return bits_st.float_st; - } - - bfloat16 operator-() const { return bfloat16(-static_cast(*this)); } - bfloat16 operator+() const { return *this; } - - bfloat16& operator=(const float rhs) - { - *this = bfloat16(rhs); - return *this; - } - bfloat16& operator+=(bfloat16 rhs) - { - *this = bfloat16(static_cast(*this) + static_cast(rhs)); - return *this; - } - - bfloat16& operator+=(float rhs) - { - *this = bfloat16(static_cast(*this) + rhs); - return *this; - } - - bfloat16& operator-=(bfloat16 rhs) - { - *this += -rhs; - return *this; - } - bfloat16& operator*=(bfloat16 rhs) - { - *this = bfloat16(static_cast(*this) * static_cast(rhs)); - return *this; - } - bfloat16& operator*=(float rhs) - { - *this = bfloat16(static_cast(*this) * rhs); - return *this; - } - - bfloat16& operator/=(bfloat16 rhs) - { - *this = bfloat16(static_cast(*this) / static_cast(rhs)); - return *this; - } - bool operator<(bfloat16 rhs) const - { - return static_cast(*this) < static_cast(rhs); - } - bool operator==(bfloat16 rhs) const { return std::equal_to()(*this, rhs); } - - static constexpr bfloat16 generate(uint16_t val) { return bfloat16{val, true}; } - -private: - constexpr bfloat16(std::uint16_t val, bool) : data_{val} {} - - std::uint16_t data_; -}; - -inline bfloat16 operator+(bfloat16 a, const bfloat16& b) -{ - a += b; - return a; -} - -inline bfloat16 operator-(bfloat16 a, const bfloat16& b) -{ - a -= b; - return a; -} - -inline bfloat16 operator*(bfloat16 a, const bfloat16& b) -{ - a *= b; - return a; -} - -inline bfloat16 operator/(bfloat16 a, const bfloat16& b) -{ - a /= b; - return a; -} - -namespace std { -template <> -class numeric_limits -{ 
-public: - static constexpr bool is_specialized = true; - static constexpr bfloat16 min() noexcept { return bfloat16::generate(0x0080); } // 0x1.00p-126 - static constexpr bfloat16 max() noexcept { return bfloat16::generate(0x7F7F); } - static constexpr bfloat16 lowest() noexcept { return bfloat16::generate(0xFF7F); } - static constexpr bfloat16 epsilon() noexcept { return bfloat16::generate(0x3C00); } - static constexpr bfloat16 infinity() noexcept { return bfloat16::generate(0x7F80); } - static constexpr bfloat16 quiet_NaN() noexcept { return bfloat16::generate(0x7FC0); } // qnan(0) - static constexpr bfloat16 signaling_NaN() noexcept - { - return bfloat16::generate(0x7F81); // snan(1) - } - static constexpr bfloat16 denorm_min() noexcept - { - return bfloat16::generate(0x0001); // 0x0.02p-126 - } -}; -} // namespace std +#include #endif diff --git a/projects/miopen/src/include/miopen/reduce_common.hpp b/projects/miopen/src/include/miopen/reduce_common.hpp index 37b92e727d92..f1bd0b38e320 100644 --- a/projects/miopen/src/include/miopen/reduce_common.hpp +++ b/projects/miopen/src/include/miopen/reduce_common.hpp @@ -1,66 +1,5 @@ -/******************************************************************************* - * - * MIT License - * - * Copyright (c) 2020 Advanced Micro Devices, Inc. - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. 
- * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - * - *******************************************************************************/ +// Forwarding header — implementation moved to common_utils. #ifndef GUARD_MIOPEN_REDUCE_COMMON_HPP #define GUARD_MIOPEN_REDUCE_COMMON_HPP - -#include -#include - -namespace reduce { - -template -static inline Tdst convert_type(Tsrc x) -{ - return static_cast(x); -} - -template <> -inline float convert_type(half_float::half x) -{ - return half_float::half_cast(x); -}; - -template <> -inline half_float::half convert_type(float x) -{ - return half_float::half_cast(x); -}; - -template <> -inline float convert_type(bfloat16 x) -{ - return float(x); -}; - -template <> -inline bfloat16 convert_type(float x) -{ - return bfloat16(x); -}; - -}; // end of namespace reduce - +#include #endif diff --git a/projects/miopen/src/include/miopen/stringutils.hpp b/projects/miopen/src/include/miopen/stringutils.hpp index 5a412416d666..38f52efd1cf6 100644 --- a/projects/miopen/src/include/miopen/stringutils.hpp +++ b/projects/miopen/src/include/miopen/stringutils.hpp @@ -1,165 +1,5 @@ -/******************************************************************************* - * - * MIT License - * - * Copyright (c) 2017 Advanced Micro Devices, Inc. 
- * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - * - *******************************************************************************/ +// Forwarding header — implementation moved to common_utils. #ifndef GUARD_MIOPEN_STRINGUTILS_HPP #define GUARD_MIOPEN_STRINGUTILS_HPP - -#include -#include -#include -#include -#include -#include -#include -#include - -#define MIOPEN_STRINGIZE_1(...) #__VA_ARGS__ -#define MIOPEN_STRINGIZE(...) 
MIOPEN_STRINGIZE_1(__VA_ARGS__) - -namespace miopen { - -inline std::string -ReplaceString(const std::string& in, const std::string& search, const std::string& replace) -{ - size_t pos = 0; - std::string subject(in); - while((pos = subject.find(search, pos)) != std::string::npos) - { - subject.replace(pos, search.length(), replace); - pos += replace.length(); - } - return subject; -} - -inline bool EndsWith(const std::string& value, const std::string& suffix) -{ - if(suffix.size() > value.size()) - return false; - else - return std::equal(suffix.rbegin(), suffix.rend(), value.rbegin()); -} - -template -inline std::string JoinStrings(Strings strings, std::string delim) -{ - auto it = strings.begin(); - if(it == strings.end()) - return ""; - - auto nit = std::next(it); - return std::accumulate( - nit, strings.end(), *it, [&](std::string x, std::string y) { return x + delim + y; }); -} - -template -static inline std::string TransformString(std::string s, F f) -{ - std::transform(s.begin(), s.end(), s.begin(), f); - return s; -} - -inline std::string ToUpper(std::string s) { return TransformString(std::move(s), ::toupper); } - -inline bool StartsWith(const std::string& value, const std::string& prefix) -{ - if(prefix.size() > value.size()) - return false; - else - return std::equal(prefix.begin(), prefix.end(), value.begin()); -} - -inline std::string RemovePrefix(std::string s, std::string prefix) -{ - if(StartsWith(s, prefix)) - return s.substr(prefix.length()); - else - return s; -} - -inline std::vector SplitSpaceSeparated(const std::string& in) -{ - std::istringstream ss(in); - const std::istream_iterator begin(ss), end; - return {begin, end}; -} - -inline std::vector SplitSpaceSeparated(const std::vector& in) -{ - std::vector rv; - for(const auto& item : in) - { - if(item.find(' ') != std::string::npos) - { - const auto splitted = SplitSpaceSeparated(item); - std::copy(splitted.begin(), splitted.end(), std::back_inserter(rv)); - } - else - { - 
rv.emplace_back(item); - } - } - return rv; -} - -inline std::vector SplitSpaceSeparated(const std::string& in, - const std::vector& dontSplitAfter) -{ - std::vector rv; - std::istringstream ss(in); - std::string s; - while(ss >> s) - { - if(any_of(dontSplitAfter, [&](const auto& dont) { return dont == s; })) - { - std::string s2; - if(ss >> s2) - { - s += std::string(" ").append(s2); // Exactly one space is important. - rv.push_back(s); - continue; - } - MIOPEN_THROW("Error parsing string: '" + in + '\''); - } - rv.push_back(s); - } - return rv; -} - -inline std::vector SplitDelim(const std::string& in, const char delim) -{ - std::vector rv; - std::string token; - std::istringstream ss(in); - - while(std::getline(ss, token, delim)) - { - rv.push_back(token); - } - return rv; -} - -} // namespace miopen - -#endif // GUARD_MIOPEN_STRINGUTILS_HPP +#include +#endif From 295374db784d40c334be5f4e544e7d803d28666a Mon Sep 17 00:00:00 2001 From: Brad Pepers Date: Sat, 9 May 2026 07:32:18 -0600 Subject: [PATCH 03/11] Create miopen_utils library and move shared test/verification code Move 12 headers from test/ to miopen_utils/include/miopen_utils/: - tensor_holder.hpp, verify.hpp (used by 30+ driver files) - cpu_conv.hpp, cpu_bias.hpp, cpu_layernorm.hpp (CPU reference) - fusionHost.hpp, gemm.hpp, cpu_reduce_util.hpp, rnn_util.hpp - random.hpp (test initializers) - serialize.hpp, network_data.hpp (tensor_holder dependencies) Include cleanup: - Removed unused #include "test.hpp" from cpu_conv.hpp, cpu_bias.hpp - Removed unused #include "get_handle.hpp" from fusionHost.hpp - Updated all internal cross-references to use and paths Updated 35 driver files to include from instead of <../test/>. Forwarding headers left at original test/ locations for backward compatibility with existing test code. Result: driver/ no longer includes from test/, and miopen_utils/ no longer includes from driver/ or test/. 
Co-Authored-By: Claude Sonnet 4 --- projects/miopen/CMakeLists.txt | 1 + .../miopen/driver/CBAInferFusion_driver.hpp | 6 +- projects/miopen/driver/CMakeLists.txt | 2 +- projects/miopen/driver/adam_driver.hpp | 2 +- .../miopen/driver/addlayernorm_driver.hpp | 4 +- projects/miopen/driver/bn_driver.hpp | 6 +- projects/miopen/driver/cat_driver.hpp | 4 +- projects/miopen/driver/conv_driver.hpp | 8 +- projects/miopen/driver/conv_verify.hpp | 2 +- projects/miopen/driver/ctc_driver.hpp | 2 +- projects/miopen/driver/driver.hpp | 2 +- projects/miopen/driver/dropout_driver.hpp | 2 +- projects/miopen/driver/gemm_driver.hpp | 2 +- projects/miopen/driver/getitem_driver.hpp | 4 +- projects/miopen/driver/glu_driver.hpp | 2 +- projects/miopen/driver/groupnorm_driver.hpp | 4 +- projects/miopen/driver/gru_verify_gemm.hpp | 2 +- projects/miopen/driver/kthvalue_driver.hpp | 4 +- projects/miopen/driver/layernorm_driver.hpp | 6 +- projects/miopen/driver/lrn_driver.hpp | 2 +- projects/miopen/driver/lstm_verify_gemm.hpp | 2 +- projects/miopen/driver/miopen_Reduction.hpp | 2 +- .../miopen/driver/multimarginloss_driver.hpp | 4 +- projects/miopen/driver/prelu_driver.hpp | 2 +- projects/miopen/driver/reduce_driver.hpp | 2 +- .../driver/reducecalculation_driver.hpp | 4 +- .../miopen/driver/reduceextreme_driver.hpp | 4 +- projects/miopen/driver/rnn_driver.hpp | 2 +- projects/miopen/driver/rnn_seq_driver.hpp | 2 +- projects/miopen/driver/rnn_verify_gemm.hpp | 2 +- projects/miopen/driver/rope_driver.hpp | 4 +- .../miopen/driver/softmarginloss_driver.hpp | 4 +- projects/miopen/driver/softmax_driver.hpp | 2 +- projects/miopen/driver/t5layernorm_driver.hpp | 4 +- .../driver/transformers_adam_w_driver.hpp | 2 +- projects/miopen/miopen_utils/CMakeLists.txt | 38 + .../include/miopen_utils/cpu_bias.hpp | 140 +++ .../include/miopen_utils/cpu_conv.hpp | 514 +++++++++ .../include/miopen_utils/cpu_layernorm.hpp | 216 ++++ .../include/miopen_utils/cpu_reduce_util.hpp | 649 ++++++++++++ 
.../include/miopen_utils/fusionHost.hpp | 993 +++++++++++++++++ .../include/miopen_utils/gemm.hpp | 120 +++ .../include/miopen_utils/network_data.hpp | 438 ++++++++ .../include/miopen_utils/random.hpp | 62 ++ .../include/miopen_utils/rnn_util.hpp | 305 ++++++ .../include/miopen_utils/serialize.hpp | 129 +++ .../include/miopen_utils/tensor_holder.hpp | 505 +++++++++ .../include/miopen_utils/verify.hpp | 245 +++++ projects/miopen/test/CMakeLists.txt | 4 +- projects/miopen/test/cpu_bias.hpp | 140 +-- projects/miopen/test/cpu_conv.hpp | 514 +-------- projects/miopen/test/cpu_layernorm.hpp | 215 +--- projects/miopen/test/cpu_reduce_util.hpp | 648 +----------- projects/miopen/test/fusionHost.hpp | 995 +----------------- projects/miopen/test/gemm.hpp | 119 +-- projects/miopen/test/network_data.hpp | 437 +------- projects/miopen/test/random.hpp | 63 +- projects/miopen/test/rnn_util.hpp | 304 +----- projects/miopen/test/serialize.hpp | 128 +-- projects/miopen/test/tensor_holder.hpp | 504 +-------- projects/miopen/test/verify.hpp | 244 +---- 61 files changed, 4436 insertions(+), 4342 deletions(-) create mode 100644 projects/miopen/miopen_utils/CMakeLists.txt create mode 100644 projects/miopen/miopen_utils/include/miopen_utils/cpu_bias.hpp create mode 100644 projects/miopen/miopen_utils/include/miopen_utils/cpu_conv.hpp create mode 100644 projects/miopen/miopen_utils/include/miopen_utils/cpu_layernorm.hpp create mode 100644 projects/miopen/miopen_utils/include/miopen_utils/cpu_reduce_util.hpp create mode 100644 projects/miopen/miopen_utils/include/miopen_utils/fusionHost.hpp create mode 100644 projects/miopen/miopen_utils/include/miopen_utils/gemm.hpp create mode 100644 projects/miopen/miopen_utils/include/miopen_utils/network_data.hpp create mode 100644 projects/miopen/miopen_utils/include/miopen_utils/random.hpp create mode 100644 projects/miopen/miopen_utils/include/miopen_utils/rnn_util.hpp create mode 100644 projects/miopen/miopen_utils/include/miopen_utils/serialize.hpp 
create mode 100644 projects/miopen/miopen_utils/include/miopen_utils/tensor_holder.hpp create mode 100644 projects/miopen/miopen_utils/include/miopen_utils/verify.hpp diff --git a/projects/miopen/CMakeLists.txt b/projects/miopen/CMakeLists.txt index 627ddec85bbd..26bf20fd0690 100644 --- a/projects/miopen/CMakeLists.txt +++ b/projects/miopen/CMakeLists.txt @@ -897,6 +897,7 @@ endif() add_subdirectory(common_utils) add_subdirectory(addkernels) add_subdirectory(src) +add_subdirectory(miopen_utils) if(MIOPEN_BUILD_DRIVER) add_subdirectory(driver) endif() diff --git a/projects/miopen/driver/CBAInferFusion_driver.hpp b/projects/miopen/driver/CBAInferFusion_driver.hpp index 0b63f8fe5af6..8bc25e1ffc58 100644 --- a/projects/miopen/driver/CBAInferFusion_driver.hpp +++ b/projects/miopen/driver/CBAInferFusion_driver.hpp @@ -36,9 +36,9 @@ #include "util_driver.hpp" #include "conv_common.hpp" -#include "../test/verify.hpp" -#include "../test/cpu_conv.hpp" -#include "../test/cpu_bias.hpp" +#include +#include +#include #include #include diff --git a/projects/miopen/driver/CMakeLists.txt b/projects/miopen/driver/CMakeLists.txt index 693a3d47d599..835d6437b650 100644 --- a/projects/miopen/driver/CMakeLists.txt +++ b/projects/miopen/driver/CMakeLists.txt @@ -74,7 +74,7 @@ endif() add_dependencies(MIOpenDriver generate_kernels) target_include_directories(MIOpenDriver PRIVATE ../src/kernels) # MIOpen_with_plugins ensures CK plugin .so's are built alongside MIOpenDriver -target_link_libraries(MIOpenDriver PRIVATE MIOpen_with_plugins Threads::Threads roc::rocrand nlohmann_json::nlohmann_json miopen_common_utils) +target_link_libraries(MIOpenDriver PRIVATE MIOpen_with_plugins Threads::Threads roc::rocrand nlohmann_json::nlohmann_json miopen_common_utils miopen_utils) if(NOT MIOPEN_EMBED_DB STREQUAL "") target_link_libraries(MIOpenDriver PRIVATE $ ) endif() diff --git a/projects/miopen/driver/adam_driver.hpp b/projects/miopen/driver/adam_driver.hpp index f0c0258c8241..6c1984c44e87 100644 
--- a/projects/miopen/driver/adam_driver.hpp +++ b/projects/miopen/driver/adam_driver.hpp @@ -32,7 +32,7 @@ #include "tensor_driver.hpp" #include "timer.hpp" -#include "../test/verify.hpp" +#include #include #include diff --git a/projects/miopen/driver/addlayernorm_driver.hpp b/projects/miopen/driver/addlayernorm_driver.hpp index effdc90c6127..a1bac6125dfc 100644 --- a/projects/miopen/driver/addlayernorm_driver.hpp +++ b/projects/miopen/driver/addlayernorm_driver.hpp @@ -26,8 +26,8 @@ #ifndef GUARD_MIOPEN_ADDLAYERNORM_DRIVER_HPP #define GUARD_MIOPEN_ADDLAYERNORM_DRIVER_HPP -#include <../test/tensor_holder.hpp> -#include <../test/verify.hpp> +#include +#include #include "InputFlags.hpp" #include "driver.hpp" #include "random.hpp" diff --git a/projects/miopen/driver/bn_driver.hpp b/projects/miopen/driver/bn_driver.hpp index 29cdfd970356..82802f8bd965 100644 --- a/projects/miopen/driver/bn_driver.hpp +++ b/projects/miopen/driver/bn_driver.hpp @@ -35,9 +35,9 @@ #include "util_driver.hpp" #include "rocrand_wrapper.hpp" -#include "../test/verify.hpp" -#include "../test/random.hpp" -#include "../test/fusionHost.hpp" +#include +#include +#include #include #include diff --git a/projects/miopen/driver/cat_driver.hpp b/projects/miopen/driver/cat_driver.hpp index f9a675440c15..a4e6804f9aad 100644 --- a/projects/miopen/driver/cat_driver.hpp +++ b/projects/miopen/driver/cat_driver.hpp @@ -18,8 +18,8 @@ #include #include #include -#include <../test/tensor_holder.hpp> -#include <../test/verify.hpp> +#include +#include #include #ifndef MLO_CATHOST_H_ diff --git a/projects/miopen/driver/conv_driver.hpp b/projects/miopen/driver/conv_driver.hpp index fcdbdbbd2ea6..77010d71e87a 100644 --- a/projects/miopen/driver/conv_driver.hpp +++ b/projects/miopen/driver/conv_driver.hpp @@ -28,10 +28,10 @@ #include #include -#include <../test/cpu_bias.hpp> -#include <../test/cpu_conv.hpp> -#include <../test/tensor_holder.hpp> -#include <../test/verify.hpp> +#include +#include +#include +#include 
#include #include diff --git a/projects/miopen/driver/conv_verify.hpp b/projects/miopen/driver/conv_verify.hpp index ae315843f01e..31d611bce134 100644 --- a/projects/miopen/driver/conv_verify.hpp +++ b/projects/miopen/driver/conv_verify.hpp @@ -27,7 +27,7 @@ #define GUARD_MIOPEN_CONV_VERIFY_HPP #include -#include "../test/gemm.hpp" +#include template diff --git a/projects/miopen/driver/ctc_driver.hpp b/projects/miopen/driver/ctc_driver.hpp index 2b8e64a8f79a..85aecb3264d3 100644 --- a/projects/miopen/driver/ctc_driver.hpp +++ b/projects/miopen/driver/ctc_driver.hpp @@ -35,7 +35,7 @@ #include -#include <../test/verify.hpp> +#include #include #include diff --git a/projects/miopen/driver/driver.hpp b/projects/miopen/driver/driver.hpp index 5bb698554566..2ebbcc2a4000 100644 --- a/projects/miopen/driver/driver.hpp +++ b/projects/miopen/driver/driver.hpp @@ -39,7 +39,7 @@ #include #include #include -#include <../test/tensor_holder.hpp> +#include #include "util_driver.hpp" #include "rocrand_wrapper.hpp" using half = half_float::half; diff --git a/projects/miopen/driver/dropout_driver.hpp b/projects/miopen/driver/dropout_driver.hpp index 84d942155a08..0016340fd60e 100644 --- a/projects/miopen/driver/dropout_driver.hpp +++ b/projects/miopen/driver/dropout_driver.hpp @@ -34,7 +34,7 @@ #include "util_driver.hpp" #include "util_file.hpp" -#include <../test/verify.hpp> +#include #include #include diff --git a/projects/miopen/driver/gemm_driver.hpp b/projects/miopen/driver/gemm_driver.hpp index d89a09a56644..8383b01ec22f 100644 --- a/projects/miopen/driver/gemm_driver.hpp +++ b/projects/miopen/driver/gemm_driver.hpp @@ -34,7 +34,7 @@ #include "random.hpp" #include "util_driver.hpp" -#include <../test/verify.hpp> +#include #include #include diff --git a/projects/miopen/driver/getitem_driver.hpp b/projects/miopen/driver/getitem_driver.hpp index 52a5bc262f82..55b0dfcd296c 100644 --- a/projects/miopen/driver/getitem_driver.hpp +++ b/projects/miopen/driver/getitem_driver.hpp @@ -40,8 
+40,8 @@ #include #include #include -#include <../test/tensor_holder.hpp> -#include <../test/verify.hpp> +#include +#include template int32_t mloGetitemBackwardRunHost(miopenTensorDescriptor_t dyDesc, diff --git a/projects/miopen/driver/glu_driver.hpp b/projects/miopen/driver/glu_driver.hpp index 38deb2d69e78..63bf7188db4d 100644 --- a/projects/miopen/driver/glu_driver.hpp +++ b/projects/miopen/driver/glu_driver.hpp @@ -38,7 +38,7 @@ #include #include -#include <../test/verify.hpp> +#include #include #include diff --git a/projects/miopen/driver/groupnorm_driver.hpp b/projects/miopen/driver/groupnorm_driver.hpp index 3773654c842d..97553dd3c13e 100644 --- a/projects/miopen/driver/groupnorm_driver.hpp +++ b/projects/miopen/driver/groupnorm_driver.hpp @@ -32,7 +32,7 @@ #include "mloGroupNormHost.hpp" #include "tensor_driver.hpp" #include "timer.hpp" -#include <../test/verify.hpp> +#include #include #include #include @@ -40,7 +40,7 @@ #include #include #include -#include <../test/tensor_holder.hpp> +#include #include "random.hpp" template diff --git a/projects/miopen/driver/gru_verify_gemm.hpp b/projects/miopen/driver/gru_verify_gemm.hpp index e07d6eab0bff..237d311b1c29 100644 --- a/projects/miopen/driver/gru_verify_gemm.hpp +++ b/projects/miopen/driver/gru_verify_gemm.hpp @@ -28,7 +28,7 @@ #include "dropout_gpu_emulator.hpp" -#include <../test/rnn_util.hpp> +#include #include #include diff --git a/projects/miopen/driver/kthvalue_driver.hpp b/projects/miopen/driver/kthvalue_driver.hpp index 75f7e5b535b2..8cbfa302bf14 100644 --- a/projects/miopen/driver/kthvalue_driver.hpp +++ b/projects/miopen/driver/kthvalue_driver.hpp @@ -30,8 +30,8 @@ #include "timer.hpp" #include "random.hpp" -#include <../test/tensor_holder.hpp> -#include <../test/verify.hpp> +#include +#include #include #include diff --git a/projects/miopen/driver/layernorm_driver.hpp b/projects/miopen/driver/layernorm_driver.hpp index 6f6662f202f6..042e8a7164ea 100644 --- 
a/projects/miopen/driver/layernorm_driver.hpp +++ b/projects/miopen/driver/layernorm_driver.hpp @@ -26,9 +26,9 @@ #ifndef GUARD_MIOPEN_LAYERNORM_DRIVER_HPP #define GUARD_MIOPEN_LAYERNORM_DRIVER_HPP -#include <../test/tensor_holder.hpp> -#include <../test/verify.hpp> -#include <../test/cpu_layernorm.hpp> +#include +#include +#include #include "InputFlags.hpp" #include "driver.hpp" #include "miopen/miopen.h" diff --git a/projects/miopen/driver/lrn_driver.hpp b/projects/miopen/driver/lrn_driver.hpp index c1645621acd4..2f164aad38b1 100644 --- a/projects/miopen/driver/lrn_driver.hpp +++ b/projects/miopen/driver/lrn_driver.hpp @@ -12,7 +12,7 @@ #include "timer.hpp" #include "util_driver.hpp" -#include "../test/verify.hpp" +#include #include #include diff --git a/projects/miopen/driver/lstm_verify_gemm.hpp b/projects/miopen/driver/lstm_verify_gemm.hpp index fb98d5616ad5..a761779738f4 100644 --- a/projects/miopen/driver/lstm_verify_gemm.hpp +++ b/projects/miopen/driver/lstm_verify_gemm.hpp @@ -28,7 +28,7 @@ #include "dropout_gpu_emulator.hpp" -#include <../test/rnn_util.hpp> +#include #include #include diff --git a/projects/miopen/driver/miopen_Reduction.hpp b/projects/miopen/driver/miopen_Reduction.hpp index 3aee4e375c97..0fc05603bf2e 100644 --- a/projects/miopen/driver/miopen_Reduction.hpp +++ b/projects/miopen/driver/miopen_Reduction.hpp @@ -31,7 +31,7 @@ #include #include -#include "../test/cpu_reduce_util.hpp" +#include #include "tensor_driver.hpp" diff --git a/projects/miopen/driver/multimarginloss_driver.hpp b/projects/miopen/driver/multimarginloss_driver.hpp index dab040ef3ef3..5d2a60db4507 100644 --- a/projects/miopen/driver/multimarginloss_driver.hpp +++ b/projects/miopen/driver/multimarginloss_driver.hpp @@ -36,8 +36,8 @@ #include #include #include -#include <../test/tensor_holder.hpp> -#include <../test/verify.hpp> +#include +#include #include template diff --git a/projects/miopen/driver/prelu_driver.hpp b/projects/miopen/driver/prelu_driver.hpp index 
761f97cc64eb..cab2eb811885 100644 --- a/projects/miopen/driver/prelu_driver.hpp +++ b/projects/miopen/driver/prelu_driver.hpp @@ -31,7 +31,7 @@ #include "tensor_driver.hpp" #include "timer.hpp" -#include <../test/verify.hpp> +#include #include diff --git a/projects/miopen/driver/reduce_driver.hpp b/projects/miopen/driver/reduce_driver.hpp index ab1c50e806f1..6300fa32a690 100644 --- a/projects/miopen/driver/reduce_driver.hpp +++ b/projects/miopen/driver/reduce_driver.hpp @@ -35,7 +35,7 @@ #include "util_driver.hpp" #include "util_file.hpp" -#include "../test/verify.hpp" +#include #include #include diff --git a/projects/miopen/driver/reducecalculation_driver.hpp b/projects/miopen/driver/reducecalculation_driver.hpp index 200196950997..738fb6032f3c 100644 --- a/projects/miopen/driver/reducecalculation_driver.hpp +++ b/projects/miopen/driver/reducecalculation_driver.hpp @@ -40,8 +40,8 @@ #include #include #include -#include <../test/tensor_holder.hpp> -#include <../test/verify.hpp> +#include +#include #include "../src/kernels/MIOpenReduceCalculation.hpp" #ifndef MLO_REDUCE_CALCULATIONMHOST_H_ diff --git a/projects/miopen/driver/reduceextreme_driver.hpp b/projects/miopen/driver/reduceextreme_driver.hpp index a06f5288a164..b2caf5dda398 100644 --- a/projects/miopen/driver/reduceextreme_driver.hpp +++ b/projects/miopen/driver/reduceextreme_driver.hpp @@ -39,8 +39,8 @@ #include #include #include -#include <../test/tensor_holder.hpp> -#include <../test/verify.hpp> +#include +#include #include "../src/kernels/MIOpenReduceExtreme.hpp" template diff --git a/projects/miopen/driver/rnn_driver.hpp b/projects/miopen/driver/rnn_driver.hpp index 4cd47739f5ea..7f35be320155 100644 --- a/projects/miopen/driver/rnn_driver.hpp +++ b/projects/miopen/driver/rnn_driver.hpp @@ -36,7 +36,7 @@ #include "util_driver.hpp" #include "util_file.hpp" -#include <../test/verify.hpp> +#include #include #include diff --git a/projects/miopen/driver/rnn_seq_driver.hpp 
b/projects/miopen/driver/rnn_seq_driver.hpp index 1ac9b23c0b4c..7babcfd00273 100644 --- a/projects/miopen/driver/rnn_seq_driver.hpp +++ b/projects/miopen/driver/rnn_seq_driver.hpp @@ -36,7 +36,7 @@ #include "util_driver.hpp" #include "util_file.hpp" -#include <../test/verify.hpp> +#include #include #include diff --git a/projects/miopen/driver/rnn_verify_gemm.hpp b/projects/miopen/driver/rnn_verify_gemm.hpp index b1fa42c3503b..04b73111513d 100644 --- a/projects/miopen/driver/rnn_verify_gemm.hpp +++ b/projects/miopen/driver/rnn_verify_gemm.hpp @@ -28,7 +28,7 @@ #include "dropout_gpu_emulator.hpp" -#include <../test/rnn_util.hpp> +#include #include #include diff --git a/projects/miopen/driver/rope_driver.hpp b/projects/miopen/driver/rope_driver.hpp index bbad2370bf4e..27f0a03126ac 100644 --- a/projects/miopen/driver/rope_driver.hpp +++ b/projects/miopen/driver/rope_driver.hpp @@ -39,8 +39,8 @@ #include #include #include -#include <../test/tensor_holder.hpp> -#include <../test/verify.hpp> +#include +#include template int32_t mloRoPEForwardRunHost(miopenTensorDescriptor_t xDesc, diff --git a/projects/miopen/driver/softmarginloss_driver.hpp b/projects/miopen/driver/softmarginloss_driver.hpp index 3a6b095eaa0e..6589abd88db9 100644 --- a/projects/miopen/driver/softmarginloss_driver.hpp +++ b/projects/miopen/driver/softmarginloss_driver.hpp @@ -35,8 +35,8 @@ #include #include #include -#include <../test/tensor_holder.hpp> -#include <../test/verify.hpp> +#include +#include #include template diff --git a/projects/miopen/driver/softmax_driver.hpp b/projects/miopen/driver/softmax_driver.hpp index e147191b2deb..52f42fdfd5f8 100644 --- a/projects/miopen/driver/softmax_driver.hpp +++ b/projects/miopen/driver/softmax_driver.hpp @@ -11,7 +11,7 @@ #include "timer.hpp" #include "util_driver.hpp" -#include <../test/verify.hpp> +#include #include #include diff --git a/projects/miopen/driver/t5layernorm_driver.hpp b/projects/miopen/driver/t5layernorm_driver.hpp index 
c8517ad525d8..b57fe456403f 100644 --- a/projects/miopen/driver/t5layernorm_driver.hpp +++ b/projects/miopen/driver/t5layernorm_driver.hpp @@ -26,8 +26,8 @@ #ifndef GUARD_MIOPEN_T5LAYERNORM_DRIVER_HPP #define GUARD_MIOPEN_T5LAYERNORM_DRIVER_HPP -#include <../test/tensor_holder.hpp> -#include <../test/verify.hpp> +#include +#include #include "InputFlags.hpp" #include "driver.hpp" #include "random.hpp" diff --git a/projects/miopen/driver/transformers_adam_w_driver.hpp b/projects/miopen/driver/transformers_adam_w_driver.hpp index dfd82a3284c6..a1cd81f2eb53 100644 --- a/projects/miopen/driver/transformers_adam_w_driver.hpp +++ b/projects/miopen/driver/transformers_adam_w_driver.hpp @@ -32,7 +32,7 @@ #include "tensor_driver.hpp" #include "timer.hpp" -#include "../test/verify.hpp" +#include #include #include diff --git a/projects/miopen/miopen_utils/CMakeLists.txt b/projects/miopen/miopen_utils/CMakeLists.txt new file mode 100644 index 000000000000..47e61c063411 --- /dev/null +++ b/projects/miopen/miopen_utils/CMakeLists.txt @@ -0,0 +1,38 @@ +################################################################################ +# +# MIT License +# +# Copyright (c) 2025 Advanced Micro Devices, Inc. +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. 
+# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. +# +################################################################################ + +# Utility library for MIOpen test/verification code shared by MIOpenDriver and tests. +# Depends on common_utils and the MIOpen public API (miopen.h). +# Phase 1: May still use MIOpen internal headers temporarily. + +add_library(miopen_utils INTERFACE) +add_library(MIOpen::miopen_utils ALIAS miopen_utils) + +target_include_directories(miopen_utils INTERFACE + $ +) + +target_link_libraries(miopen_utils INTERFACE miopen_common_utils) diff --git a/projects/miopen/miopen_utils/include/miopen_utils/cpu_bias.hpp b/projects/miopen/miopen_utils/include/miopen_utils/cpu_bias.hpp new file mode 100644 index 000000000000..0125ca37d298 --- /dev/null +++ b/projects/miopen/miopen_utils/include/miopen_utils/cpu_bias.hpp @@ -0,0 +1,140 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2019 Advanced Micro Devices, Inc. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + *******************************************************************************/ +#ifndef GUARD_CPU_BIAS_HPP +#define GUARD_CPU_BIAS_HPP + +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +template +void cpu_bias_forward_impl(tensor& out, const tensor& bias) +{ + assert(out.desc.GetNumDims() == NSpatialDim + 2 and bias.desc.GetNumDims() == NSpatialDim + 2); + assert( + bias.desc.GetLengths()[0] == 1 && bias.desc.GetLengths()[1] == out.desc.GetLengths()[1] && + std::all_of(bias.desc.GetLengths().begin() + 2, bias.desc.GetLengths().end(), [](auto v) { + return v == 1; + })); + + out.par_for_each([&](auto out_n_id, auto out_k_id, auto... out_spatial_id_pack) { + out(out_n_id, out_k_id, out_spatial_id_pack...) 
= + double(out(out_n_id, out_k_id, out_spatial_id_pack...)) + double(bias.data[out_k_id]); + }); +} + +template +void cpu_bias_backward_data_impl(const tensor& out, tensor& bias) +{ + assert(out.desc.GetNumDims() == NSpatialDim + 2 and bias.desc.GetNumDims() == NSpatialDim + 2); + assert( + bias.desc.GetLengths()[0] == 1 && bias.desc.GetLengths()[1] == out.desc.GetLengths()[0] && + std::all_of(bias.desc.GetLengths().begin() + 2, bias.desc.GetLengths().end(), [](auto v) { + return v == 1; + })); + + std::size_t out_n_len = out.desc.GetLengths()[0]; + std::size_t out_k_len = out.desc.GetLengths()[1]; + + std::array out_spatial_len{}; + std::copy_n(out.desc.GetLengths().begin() + 2, NSpatialDim, out_spatial_len.begin()); + + miopen::par_ford(out_k_len)([&](auto out_k_id) { + auto ford_out_n_spatial = + miopen::unpacker(miopen::prepender(miopen::ford, out_n_len))(out_spatial_len); + + double acc = 0; + ford_out_n_spatial([&](auto out_n_id, auto... out_spatial_id_pack) { + acc += double(out(out_n_id, out_k_id, out_spatial_id_pack...)); + }); + + bias.data[out_k_id] = acc; + }); +} + +template +void cpu_bias_forward(tensor& out, const tensor& bias) +{ + switch(out.desc.GetNumDims()) + { + case 3: { + cpu_bias_forward_impl<1>(out, bias); + break; + } + case 4: { + cpu_bias_forward_impl<2>(out, bias); + break; + } + case 5: { + cpu_bias_forward_impl<3>(out, bias); + break; + } + case 6: { + cpu_bias_forward_impl<4>(out, bias); + break; + } + default: { + MIOPEN_THROW("not belong to any case"); + } + } +} + +template +void cpu_bias_backward_data(const tensor& out, tensor& bias) +{ + switch(out.desc.GetNumDims()) + { + case 3: { + cpu_bias_backward_data_impl<1>(out, bias); + break; + } + case 4: { + cpu_bias_backward_data_impl<2>(out, bias); + break; + } + case 5: { + cpu_bias_backward_data_impl<3>(out, bias); + break; + } + case 6: { + cpu_bias_backward_data_impl<4>(out, bias); + break; + } + default: { + MIOPEN_THROW("not belong to any case"); + } + } +} +#endif diff --git 
a/projects/miopen/miopen_utils/include/miopen_utils/cpu_conv.hpp b/projects/miopen/miopen_utils/include/miopen_utils/cpu_conv.hpp new file mode 100644 index 000000000000..2ef2c5b31236 --- /dev/null +++ b/projects/miopen/miopen_utils/include/miopen_utils/cpu_conv.hpp @@ -0,0 +1,514 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2019 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + *******************************************************************************/ +#ifndef GUARD_CPU_CONV_HPP +#define GUARD_CPU_CONV_HPP + +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +template +static constexpr auto make_array(T x, Ts... 
xs) +{ + return std::array{{x, xs...}}; +} + +template +struct PassThru +{ + T operator()(T t) { return t; } +}; + +template +struct cpu_convolution_acc_type +{ + using type = double; // default using double as accumulator +}; + +template <> +struct cpu_convolution_acc_type +{ + using type = int32_t; +}; + +template <> +struct cpu_convolution_acc_type +{ + using type = double; +}; + +template +void cpu_convolution_forward_impl(const tensor& in, + const tensor& wei, + tensor& out, + const Range& pads, + const Range& strides, + const Range& dilations, + std::size_t group_count, + FI fi = {}, + FW fw = {}) +{ + static_assert(ConvDim > 0, "wrong! convolution dim should be larger than 0"); + assert(in.desc.GetNumDims() == ConvDim + 2 and wei.desc.GetNumDims() == ConvDim + 2 and + out.desc.GetNumDims() == ConvDim + 2 and pads.size() == ConvDim and + strides.size() == ConvDim and dilations.size() == ConvDim); + std::size_t out_n_len = out.desc.GetLengths()[0]; + + std::size_t wei_k_len = wei.desc.GetLengths()[0]; + std::size_t wei_c_len = wei.desc.GetLengths()[1]; + + std::size_t vector_len = in.desc.GetVectorLength(); + + std::array in_spatial_len{}; + std::array wei_spatial_len{}; + std::array out_spatial_len{}; + + std::copy_n(in.desc.GetLengths().begin() + 2, ConvDim, in_spatial_len.begin()); + std::copy_n(wei.desc.GetLengths().begin() + 2, ConvDim, wei_spatial_len.begin()); + std::copy_n(out.desc.GetLengths().begin() + 2, ConvDim, out_spatial_len.begin()); + + if(wei.desc.GetLayout_str() == "CHWNc") + { + wei_c_len = wei.desc.GetLengths()[0]; + std::copy_n(wei.desc.GetLengths().begin() + 1, ConvDim, wei_spatial_len.begin()); + wei_k_len = wei.desc.GetLengths()[3]; + } + + std::size_t wei_k_len_per_group = wei_k_len / group_count; + + // f(x0, x1, xs...) + // f1(xs...) = f(x0, x1, xs...) + // f2(xs_array) = f1(xs...) 
+ auto par_ford_out_nk_spatial = miopen::unpacker( + miopen::prepender(miopen::par_ford, out_n_len, wei_k_len))(out_spatial_len); + + par_ford_out_nk_spatial([&](std::size_t out_n_id, + std::size_t out_k_id, + auto... out_spatial_id_pack) { + auto out_spatial_id = make_array(out_spatial_id_pack...); + + std::size_t group_id = out_k_id / wei_k_len_per_group; + Tacc acc = 0; + + miopen::ford(wei_c_len)([&](std::size_t wei_c_id) { + std::size_t in_c_id = group_id * wei_c_len + wei_c_id; + + auto ford_wei_spatial = miopen::unpacker(miopen::ford)(wei_spatial_len); + + ford_wei_spatial([&](auto... wei_spatial_id_pack) { + auto wei_spatial_id = make_array(wei_spatial_id_pack...); + + std::array in_spatial_id{}; + + for(std::size_t i = 0; i < ConvDim; ++i) + { + in_spatial_id[i] = + out_spatial_id[i] * strides[i] + wei_spatial_id[i] * dilations[i] - pads[i]; + } + bool out_of_bound = false; + for(std::size_t i = 0; i < ConvDim; ++i) + { + out_of_bound = out_of_bound or + (in_spatial_id[i] < 0 or in_spatial_id[i] >= in_spatial_len[i]); + } + if(!out_of_bound) + { + if(vector_len > 1) + { + std::array in_id{}; + in_id[1] = out_n_id; + in_id[2] = in_c_id; + std::copy_n(in_spatial_id.begin(), ConvDim, in_id.begin() + 3); + for(std::size_t i = 0; i < vector_len; i++) + { + in_id[0] = i; + acc += Tacc(in(in_id)) * + Tacc(wei(i, out_k_id, wei_c_id, wei_spatial_id_pack...)); + } + } + else + { + std::array in_id{}; + in_id[0] = out_n_id; + in_id[1] = in_c_id; + std::copy_n(in_spatial_id.begin(), ConvDim, in_id.begin() + 2); + Tacc tmp1 = static_cast(fi(in(in_id))); + Tacc tmp2 = + static_cast(fw(wei(out_k_id, wei_c_id, wei_spatial_id_pack...))); + acc += tmp1 * tmp2; + } + } + }); + }); + if(vector_len > 1) + { + out(out_k_id % vector_len, out_n_id, out_k_id / vector_len, out_spatial_id_pack...) = + static_cast(acc); + } + else + { + out(out_n_id, out_k_id, out_spatial_id_pack...) 
= static_cast(acc); + } + }); +} + +template +void cpu_convolution_backward_data_impl(tensor& in, + const tensor& wei, + const tensor& out, + const Range& pads, + const Range& strides, + const Range& dilations, + std::size_t group_count, + FW fw = {}, + FO fo = {}) +{ + static_assert(ConvDim > 0, "wrong! convolution dim should be larger than 0"); + assert(in.desc.GetNumDims() == ConvDim + 2 and wei.desc.GetNumDims() == ConvDim + 2 and + out.desc.GetNumDims() == ConvDim + 2 and pads.size() == ConvDim and + strides.size() == ConvDim and dilations.size() == ConvDim); + + std::size_t in_n_len = in.desc.GetLengths()[0]; + std::size_t in_c_len = in.desc.GetLengths()[1]; + + std::size_t wei_k_len = wei.desc.GetLengths()[0]; + std::size_t wei_c_len = wei.desc.GetLengths()[1]; + + std::size_t wei_k_len_per_group = wei_k_len / group_count; + + std::array in_spatial_len{}; + std::array wei_spatial_len{}; + std::array out_spatial_len{}; + + std::copy_n(in.desc.GetLengths().begin() + 2, ConvDim, in_spatial_len.begin()); + std::copy_n(wei.desc.GetLengths().begin() + 2, ConvDim, wei_spatial_len.begin()); + std::copy_n(out.desc.GetLengths().begin() + 2, ConvDim, out_spatial_len.begin()); + + auto par_ford_in_nc_spatial = + miopen::unpacker(miopen::prepender(miopen::par_ford, in_n_len, in_c_len))(in_spatial_len); + + par_ford_in_nc_spatial( + [&](std::size_t in_n_id, std::size_t in_c_id, auto... in_spatial_id_pack) { + auto in_spatial_id = make_array(in_spatial_id_pack...); + + std::size_t group_id = in_c_id / wei_c_len; + + Tacc acc = 0; + + miopen::ford(wei_k_len_per_group)([&](std::size_t wei_k_id_inside_group) { + auto ford_wei_spatial = miopen::unpacker(miopen::ford)(wei_spatial_len); + + ford_wei_spatial([&](auto... 
wei_spatial_id_pack) { + auto wei_spatial_id = make_array(wei_spatial_id_pack...); + + std::array out_spatial_id_{}; + std::array out_spatial_id{}; + + for(std::size_t i = 0; i < ConvDim; ++i) + { + out_spatial_id_[i] = + pads[i] + in_spatial_id[i] - wei_spatial_id[i] * dilations[i]; + out_spatial_id[i] = out_spatial_id_[i] / strides[i]; + } + + bool use = true; + for(std::size_t i = 0; i < ConvDim; ++i) + { + use &= out_spatial_id_[i] % strides[i] == 0 and out_spatial_id[i] >= 0 and + out_spatial_id[i] < out_spatial_len[i]; + } + + if(use) + { + std::size_t out_k_id = + group_id * wei_k_len_per_group + wei_k_id_inside_group; + std::size_t wei_c_id = in_c_id % wei_c_len; + + std::array out_id{}; + out_id[0] = in_n_id; + out_id[1] = out_k_id; + std::copy_n(out_spatial_id.begin(), ConvDim, out_id.begin() + 2); + Tacc tmp1 = fo(out(out_id)); + Tacc tmp2 = fw(wei(out_k_id, wei_c_id, wei_spatial_id_pack...)); + acc += tmp1 * tmp2; + } + }); + }); + // TODO: Why do we need a no-lint here ? + in(in_n_id, in_c_id, in_spatial_id_pack...) = static_cast(acc); // NOLINT + }); +} + +template +void cpu_convolution_backward_weight_impl(const tensor& in, + tensor& wei, + const tensor& out, + const Range& pads, + const Range& strides, + const Range& dilations, + std::size_t group_count, + FI fi, + FO fo) +{ + static_assert(ConvDim > 0, "wrong! 
convolution dim should be larger than 0"); + assert(in.desc.GetNumDims() == ConvDim + 2 and wei.desc.GetNumDims() == ConvDim + 2 and + out.desc.GetNumDims() == ConvDim + 2 and pads.size() == ConvDim and + strides.size() == ConvDim and dilations.size() == ConvDim); + + std::size_t out_n_len = out.desc.GetLengths()[0]; + + std::size_t wei_k_len = wei.desc.GetLengths()[0]; + std::size_t wei_c_len = wei.desc.GetLengths()[1]; + + std::size_t wei_k_len_per_group = wei_k_len / group_count; + + std::array in_spatial_len{}; + std::array wei_spatial_len{}; + std::array out_spatial_len{}; + + std::copy_n(in.desc.GetLengths().begin() + 2, ConvDim, in_spatial_len.begin()); + std::copy_n(wei.desc.GetLengths().begin() + 2, ConvDim, wei_spatial_len.begin()); + std::copy_n(out.desc.GetLengths().begin() + 2, ConvDim, out_spatial_len.begin()); + + auto par_ford_wei_kc_spatial = miopen::unpacker( + miopen::prepender(miopen::par_ford, wei_k_len, wei_c_len))(wei_spatial_len); + + par_ford_wei_kc_spatial( + [&](std::size_t wei_k_id, std::size_t wei_c_id, auto... wei_spatial_id_pack) { + auto wei_spatial_id = make_array(wei_spatial_id_pack...); + + std::size_t group_id = wei_k_id / wei_k_len_per_group; + std::size_t in_c_id = group_id * wei_c_len + wei_c_id; + + Tacc acc = 0; + + miopen::ford(out_n_len)([&](std::size_t out_n_id) { + auto ford_out_spatial = miopen::unpacker(miopen::ford)(out_spatial_len); + + ford_out_spatial([&](auto... 
out_spatial_id_pack) { + auto out_spatial_id = make_array(out_spatial_id_pack...); + + std::array in_spatial_id{}; + + for(std::size_t i = 0; i < ConvDim; ++i) + { + in_spatial_id[i] = out_spatial_id[i] * strides[i] + + wei_spatial_id[i] * dilations[i] - pads[i]; + } + + bool out_of_bound = false; + for(std::size_t i = 0; i < ConvDim; ++i) + { + out_of_bound = out_of_bound or (in_spatial_id[i] < 0 or + in_spatial_id[i] >= in_spatial_len[i]); + } + + if(!out_of_bound) + { + std::array in_id{}; + in_id[0] = out_n_id; + in_id[1] = in_c_id; + std::copy_n(in_spatial_id.begin(), ConvDim, in_id.begin() + 2); + Tacc tmp1 = fi(in(in_id)); + Tacc tmp2 = fo(out(out_n_id, wei_k_id, out_spatial_id_pack...)); + acc += tmp1 * tmp2; + } + }); + + wei(wei_k_id, wei_c_id, wei_spatial_id_pack...) = static_cast(acc); + }); + }); +} + +template , + typename FW = PassThru> +void cpu_convolution_forward(std::size_t spatial_dim, + const tensor& in, + const tensor& wei, + tensor& out, + const Range& pads, + const Range& strides, + const Range& dilations, + std::size_t group_count, + FI fi = {}, + FW fw = {}) +{ + switch(spatial_dim) + { + case 1: { + cpu_convolution_forward_impl<1, Tacc>( + in, wei, out, pads, strides, dilations, group_count, fi, fw); + break; + } + case 2: { + cpu_convolution_forward_impl<2, Tacc>( + in, wei, out, pads, strides, dilations, group_count, fi, fw); + break; + } + case 3: { + cpu_convolution_forward_impl<3, Tacc>( + in, wei, out, pads, strides, dilations, group_count, fi, fw); + break; + } + case 4: { + cpu_convolution_forward_impl<4, Tacc>( + in, wei, out, pads, strides, dilations, group_count, fi, fw); + break; + } + default: { + MIOPEN_THROW("not belong to any case"); + } + } +} + +template , + typename FO = PassThru> +void cpu_convolution_backward_data(std::size_t spatial_dim, + tensor& in, + const tensor& wei, + const tensor& out, + const Range& pads, + const Range& strides, + const Range& dilations, + std::size_t group_count, + FW fw = {}, + FO fo = {}) 
+{ + switch(spatial_dim) + { + case 1: { + cpu_convolution_backward_data_impl<1, Tacc>( + in, wei, out, pads, strides, dilations, group_count, fw, fo); + break; + } + case 2: { + cpu_convolution_backward_data_impl<2, Tacc>( + in, wei, out, pads, strides, dilations, group_count, fw, fo); + break; + } + case 3: { + cpu_convolution_backward_data_impl<3, Tacc>( + in, wei, out, pads, strides, dilations, group_count, fw, fo); + break; + } + case 4: { + cpu_convolution_backward_data_impl<4, Tacc>( + in, wei, out, pads, strides, dilations, group_count, fw, fo); + break; + } + default: { + MIOPEN_THROW("not belong to any case"); + } + } +} + +template , + typename FO = PassThru> +void cpu_convolution_backward_weight(std::size_t spatial_dim, + const tensor& in, + tensor& wei, + const tensor& out, + const Range& pads, + const Range& strides, + const Range& dilations, + std::size_t group_count, + FI fi = {}, + FO fo = {}) +{ + switch(spatial_dim) + { + case 1: { + cpu_convolution_backward_weight_impl<1, Tacc>( + in, wei, out, pads, strides, dilations, group_count, fi, fo); + break; + } + case 2: { + cpu_convolution_backward_weight_impl<2, Tacc>( + in, wei, out, pads, strides, dilations, group_count, fi, fo); + break; + } + case 3: { + cpu_convolution_backward_weight_impl<3, Tacc>( + in, wei, out, pads, strides, dilations, group_count, fi, fo); + break; + } + case 4: { + cpu_convolution_backward_weight_impl<4, Tacc>( + in, wei, out, pads, strides, dilations, group_count, fi, fo); + break; + } + default: { + MIOPEN_THROW("not belong to any case"); + } + } +} +#endif diff --git a/projects/miopen/miopen_utils/include/miopen_utils/cpu_layernorm.hpp b/projects/miopen/miopen_utils/include/miopen_utils/cpu_layernorm.hpp new file mode 100644 index 000000000000..0a6ab5556865 --- /dev/null +++ b/projects/miopen/miopen_utils/include/miopen_utils/cpu_layernorm.hpp @@ -0,0 +1,216 @@ +// Copyright © Advanced Micro Devices, Inc., or its affiliates. 
+// SPDX-License-Identifier: MIT +#ifndef GUARD_CPU_CONV_HPP +#define GUARD_CPU_CONV_HPP + +#include + +template +void cpu_layernorm_forward(tensor input, + tensor weight, + tensor bias, + tensor& ref_output, + tensor& ref_mean, + tensor& ref_rstd, + float eps, + int32_t dim, + miopenNormMode_t mode, + bool use_multithread = false) +{ + auto layout = input.desc.GetLayoutEnum(); + size_t stride = 1; + if(dim > 1 && layout.has_value() && + (layout.value() == miopenTensorNHWC || layout.value() == miopenTensorNDHWC)) + { + stride = input.desc.GetLengths()[1]; // stride = C + } + + auto dims = input.desc.GetLengths(); + size_t outer_size = 1; + size_t inner_size = 1; + for(size_t i = 0; i < dims.size(); ++i) + { + if(i < dim) + { + if(!(stride > 1 && i == 1)) + { + outer_size *= dims[i]; + } + } + else + { + inner_size *= dims[i]; + } + } + + size_t min_grain = use_multithread ? 8 : outer_size; + miopen::par_for(outer_size, min_grain, [&](int32_t o) { + miopen::ford(stride)([&](int32_t s) { + double mean_v = 0.0; + double var_v = 0.0; + + miopen::ford(inner_size)([&](int32_t i) { + double tmp = static_cast(input[o * inner_size * stride + i * stride + s]); + mean_v += tmp; + var_v += tmp * tmp; + }); + + mean_v = mean_v / inner_size; + var_v = var_v / inner_size - mean_v * mean_v; + double rstd_v = 1.0 / sqrt(var_v + eps); + + ref_mean[o * stride + s] = static_cast(mean_v); + ref_rstd[o * stride + s] = static_cast(rstd_v); + + miopen::ford(inner_size)([&](int32_t i) { + double weight_v = + (mode == MIOPEN_ELEMENTWISE_AFFINE) ? 1.0 : static_cast(weight[i]); + double bias_v = + (mode == MIOPEN_ELEMENTWISE_AFFINE) ? 
0.0 : static_cast(bias[i]); + + ref_output[o * inner_size * stride + i * stride + s] = static_cast( + (static_cast(input[o * inner_size * stride + i * stride + s]) - + mean_v) * + rstd_v * weight_v + + bias_v); + }); + }); + }); +} + +template +void cpu_layernorm_backward(tensor dy, + tensor x, + tensor weight, + tensor mean, + tensor rstd, + tensor& ref_dx, + int32_t dim, + miopenNormMode_t mode, + bool use_multithread = false) +{ + auto layout = dy.desc.GetLayoutEnum(); + size_t stride = 1; + if(dim > 1 && (layout == miopenTensorNHWC || layout == miopenTensorNDHWC)) + { + stride = dy.desc.GetLengths()[1]; // stride = C + } + + auto dims = dy.desc.GetLengths(); + size_t outer_size = 1; + size_t inner_size = 1; + for(size_t i = 0; i < dims.size(); ++i) + { + if(i < dim) + { + if(!(stride > 1 && i == 1)) + { + outer_size *= dims[i]; + } + } + else + { + inner_size *= dims[i]; + } + } + + size_t min_grain = use_multithread ? 8 : outer_size; + miopen::par_for(outer_size, min_grain, [&](int32_t o) { + miopen::ford(stride)([&](int32_t s) { + double sum_dy_weight = 0.0; + double sum_dy_weight_x = 0.0; + + miopen::ford(inner_size)([&](int32_t i) { + double pweight = + (mode == MIOPEN_ELEMENTWISE_AFFINE) ? 1.0 : static_cast(weight[i]); + double pdy = (dy.GetSize() != 0) + ? static_cast(dy[o * inner_size * stride + i * stride + s]) + : 0.0; + double px = static_cast(x[o * inner_size * stride + i * stride + s]); + + sum_dy_weight += pdy * pweight; + sum_dy_weight_x += pdy * px * pweight; + }); + + double scale = 1.0 / static_cast(inner_size); + double prstd = static_cast(rstd[o * stride + s]); + double pmean = static_cast(mean[o * stride + s]); + double a = prstd * prstd * prstd * scale * (sum_dy_weight_x - sum_dy_weight * pmean); + double b = prstd * sum_dy_weight * scale - a * pmean; + + miopen::ford(inner_size)([&](int32_t i) { + double pweight = + (mode == MIOPEN_ELEMENTWISE_AFFINE) ? 1.0 : static_cast(weight[i]); + double pdy = (dy.GetSize() != 0) + ? 
static_cast(dy[o * inner_size * stride + i * stride + s]) + : 0.0; + double val = prstd * pdy * pweight - + a * static_cast(x[o * inner_size * stride + i * stride + s]) - + b; + + ref_dx[o * inner_size * stride + i * stride + s] = static_cast(val); + }); + }); + }); +} + +template +void cpu_layernorm_backward_weight_bias(tensor dy, + tensor x, + tensor mean, + tensor rstd, + tensor& ref_dw, + tensor& ref_db, + int32_t dim, + bool use_multithread = false) +{ + auto layout = dy.desc.GetLayoutEnum(); + size_t stride = 1; + if(dim > 1 && (layout == miopenTensorNHWC || layout == miopenTensorNDHWC)) + { + stride = dy.desc.GetLengths()[1]; // stride = C + } + + auto dims = dy.desc.GetLengths(); + size_t outer_size = 1; + size_t inner_size = 1; + for(size_t i = 0; i < dims.size(); ++i) + { + if(i < dim) + { + if(!(stride > 1 && i == 1)) + { + outer_size *= dims[i]; + } + } + else + { + inner_size *= dims[i]; + } + } + + size_t min_grain = use_multithread ? 8 : inner_size; + miopen::par_for(inner_size, min_grain, [&](int32_t i) { + double sum_dw = 0.0; + double sum_db = 0.0; + + miopen::ford(stride)([&](int32_t s) { + miopen::ford(outer_size)([&](int32_t o) { + double prstd = static_cast(rstd[o * stride + s]); + double pmean = static_cast(mean[o * stride + s]); + double pdy = (dy.GetSize() != 0) + ? 
static_cast(dy[o * inner_size * stride + i * stride + s]) + : 0; + double px = static_cast(x[o * inner_size * stride + i * stride + s]); + + sum_dw += pdy * (px - pmean) * prstd; + sum_db += pdy; + }); + }); + + ref_dw[i] = sum_dw; + ref_db[i] = sum_db; + }); +} + +#endif diff --git a/projects/miopen/miopen_utils/include/miopen_utils/cpu_reduce_util.hpp b/projects/miopen/miopen_utils/include/miopen_utils/cpu_reduce_util.hpp new file mode 100644 index 000000000000..e5f7d50f9d0b --- /dev/null +++ b/projects/miopen/miopen_utils/include/miopen_utils/cpu_reduce_util.hpp @@ -0,0 +1,649 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2020 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + *******************************************************************************/ +#ifndef GUARD_CPU_REDUCE_UTIL_HPP +#define GUARD_CPU_REDUCE_UTIL_HPP + +#include "miopen/reducetensor.hpp" +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace reduce { + +template +static inline bool float_equal_one(T); + +static inline bool float_equal_one(float x) { return x == 1.0f; }; + +static inline bool float_equal_one(double x) { return x == 1.0; }; + +static inline bool float_equal_one(half_float::half x) +{ + return x == convert_type(1.0f); +}; + +template +static inline bool float_equal_zero(T x); + +static inline bool float_equal_zero(float x) { return x == 0.0f; }; + +static inline bool float_equal_zero(double x) { return x == 0.0; }; + +static inline bool float_equal_zero(half_float::half x) +{ + return x == convert_type(0.0f); +}; + +template +static inline void build_radix(const std::vector& lens, std::vector& radix) +{ + const std::size_t D = lens.size(); + radix.assign(D, 1); + for(std::size_t d = D; d-- > 1;) + radix[d - 1] = radix[d] * static_cast(lens[d]); // radix[d] = Π_{k>d} lens[k] +} + +// i -> memory offset using lens-radix + actual strides +template +static inline std::size_t linear_to_offset_by_lens_strides(std::size_t i, + const std::vector& lens, + const std::vector& radix, + const std::vector& strides) +{ + std::size_t off = 0; + for(std::size_t d = 0; d < lens.size(); ++d) + { + const std::size_t idx_d = (i / radix[d]) % static_cast(lens[d]); + off += idx_d * static_cast(strides[d]); + } + return off; +} + +template +static inline std::function PreUnaryOpFn(miopenReduceTensorOp_t op_, std::size_t) +{ + using std::abs; + + switch(op_) + { + case MIOPEN_REDUCE_TENSOR_NORM1: return ([&](compType& a_) { a_ = abs(a_); }); + case MIOPEN_REDUCE_TENSOR_NORM2: return ([&](compType& a_) { a_ = a_ * a_; }); + case MIOPEN_REDUCE_TENSOR_AMAX: return ([&](compType& a_) { a_ = abs(a_); }); + 
+ case MIOPEN_REDUCE_TENSOR_AVG: + case MIOPEN_REDUCE_TENSOR_ADD: + case MIOPEN_REDUCE_TENSOR_MUL: + case MIOPEN_REDUCE_TENSOR_MIN: + case MIOPEN_REDUCE_TENSOR_MAX: return ([&](compType&) {}); + } + + throw std::runtime_error(std::string(__FUNCTION__) + + ": using undefined Reduction operation is not permitted"); +}; + +template +static inline std::function PosUnaryOpFn(miopenReduceTensorOp_t op_, + std::size_t divider) +{ + using std::sqrt; + + switch(op_) + { + case MIOPEN_REDUCE_TENSOR_NORM2: return ([&](compType& a_) { a_ = sqrt(a_); }); + + case MIOPEN_REDUCE_TENSOR_AVG: + return ([&, divider](compType& a_) { + a_ = a_ / convert_type(static_cast(divider)); + }); + + case MIOPEN_REDUCE_TENSOR_ADD: + case MIOPEN_REDUCE_TENSOR_NORM1: + case MIOPEN_REDUCE_TENSOR_MUL: + case MIOPEN_REDUCE_TENSOR_MIN: + case MIOPEN_REDUCE_TENSOR_MAX: + case MIOPEN_REDUCE_TENSOR_AMAX: return ([&](compType&) {}); + } + + throw std::runtime_error(std::string(__FUNCTION__) + + ": using undefined Reduction operation is not permitted"); +}; + +template +static inline std::function ReduceOpFn(miopenReduceTensorOp_t op_) +{ + switch(op_) + { + case MIOPEN_REDUCE_TENSOR_ADD: + case MIOPEN_REDUCE_TENSOR_AVG: + case MIOPEN_REDUCE_TENSOR_NORM1: + case MIOPEN_REDUCE_TENSOR_NORM2: return ([&](compType& a_, compType b_) { a_ = a_ + b_; }); + + case MIOPEN_REDUCE_TENSOR_MUL: return ([&](compType& a_, compType b_) { a_ = a_ * b_; }); + + case MIOPEN_REDUCE_TENSOR_MIN: + return ([&](compType& a_, compType b_) { + if(a_ > b_) + a_ = b_; + }); + + case MIOPEN_REDUCE_TENSOR_MAX: + case MIOPEN_REDUCE_TENSOR_AMAX: + return ([&](compType& a_, compType b_) { + if(a_ < b_) + a_ = b_; + }); + } + + throw std::runtime_error(std::string(__FUNCTION__) + + ": using undefined Reduction operation is not permitted"); +}; + +template +static inline std::function +ReduceOpFn2(miopenReduceTensorOp_t op_) +{ + switch(op_) + { + case MIOPEN_REDUCE_TENSOR_MIN: + return ([&](compType& a_, compType b_, bool& changed) { + 
if(a_ > b_) + { + a_ = b_; + changed = true; + } + else + { + changed = false; + } + }); + + case MIOPEN_REDUCE_TENSOR_MAX: + case MIOPEN_REDUCE_TENSOR_AMAX: + return ([&](compType& a_, compType b_, bool& changed) { + if(a_ < b_) + { + a_ = b_; + changed = true; + } + else + { + changed = false; + } + }); + + case MIOPEN_REDUCE_TENSOR_ADD: + case MIOPEN_REDUCE_TENSOR_MUL: + case MIOPEN_REDUCE_TENSOR_AVG: + case MIOPEN_REDUCE_TENSOR_NORM1: + case MIOPEN_REDUCE_TENSOR_NORM2: return (std::function{}); + }; + + throw std::runtime_error(std::string(__FUNCTION__) + + ": using undefined Reduction operation is not permitted"); +}; + +template +static inline compType ReduceOpZeroVal(miopenReduceTensorOp_t op_) +{ + switch(op_) + { + case MIOPEN_REDUCE_TENSOR_ADD: + case MIOPEN_REDUCE_TENSOR_AVG: + case MIOPEN_REDUCE_TENSOR_NORM1: + case MIOPEN_REDUCE_TENSOR_NORM2: return (convert_type(0.0f)); + + case MIOPEN_REDUCE_TENSOR_MUL: return (convert_type(1.0f)); + + case MIOPEN_REDUCE_TENSOR_MIN: return (std::numeric_limits::max()); + + case MIOPEN_REDUCE_TENSOR_MAX: return (std::numeric_limits::lowest()); + case MIOPEN_REDUCE_TENSOR_AMAX: return (convert_type(0.0f)); + } + + throw std::runtime_error(std::string(__FUNCTION__) + + ": using undefined Reduction operation is not permitted"); +}; + +template +static inline void binop_with_nan_check(miopenNanPropagation_t nanOpt, + reduceOpT&& opReduce, + compType& accuVal, + compType currVal) +{ + using std::isnan; + + if(nanOpt == MIOPEN_NOT_PROPAGATE_NAN) + { + opReduce(accuVal, currVal); + } + else + { + if(isnan(currVal)) + accuVal = currVal; + else + opReduce(accuVal, currVal); + }; +}; + +template +static inline void binop_with_nan_check2(miopenNanPropagation_t nanOpt, + reduceOpT&& opReduce, + compType& accuVal, + compType currVal, + int& accuIndex, + int currIndex) +{ + using std::isnan; + + if(nanOpt == MIOPEN_NOT_PROPAGATE_NAN) + { + bool changed; + + opReduce(accuVal, currVal, changed); + + if(changed) + accuIndex = 
currIndex; + } + else + { + if(isnan(currVal)) + { + accuVal = currVal; + accuIndex = currIndex; + } + else + { + bool changed; + + opReduce(accuVal, currVal, changed); + + if(changed) + accuIndex = currIndex; + }; + }; +}; + +}; // end of namespace reduce + +template +std::vector> get_all_indexes(const std::vector& lens) +{ + const std::size_t D = lens.size(); + assert(D > 0); + + std::size_t N = 1; + for(const auto L : lens) + N *= static_cast(L); + + std::vector> out; + out.resize(N); + for(auto& row : out) + row.resize(D); + + std::vector stride(D, 1); + for(std::size_t d = D; d-- > 1;) + stride[d - 1] = stride[d] * static_cast(lens[d]); + + for(std::size_t r = 0; r < N; ++r) + { + for(std::size_t d = 0; d < D; ++d) + out[r][d] = static_cast((r / stride[d]) % static_cast(lens[d])); + } + + return out; +} + +template +static inline T +linear_to_offset(size_t li, const std::vector& lens, const std::vector& strides) +{ + T off = 0; + for(int d = int(lens.size()) - 1; d >= 0; --d) + { + const T idx = li % lens[d]; + li /= lens[d]; + off += idx * strides[d]; + } + return off; +} + +template +T get_offset_from_index(const std::vector& strides, const std::vector& index) +{ + T offset = 0; + + assert(strides.size() == index.size()); + + for(int i = 0; i < index.size(); i++) + offset += strides[i] * index[i]; + + return (offset); +}; + +template +T get_flatten_offset(const std::vector& lengths, const std::vector& index) +{ + T offset = 0; + + assert(lengths.size() == index.size() && !lengths.empty()); + + int len = lengths.size(); + T stride = 1; + + // for len==1, the loop is not executed + for(int i = len - 1; i > 0; i--) + { + offset += stride * index[i]; + + stride *= lengths[i]; + }; + + offset += stride * index[0]; + + return (offset); +}; + +template +struct Reducer +{ + compType acc; + bool withIdx; + int idx; // meaningful only when WithIdx==true + miopenNanPropagation_t nanOpt; + // functors for reduction + 
decltype(reduce::ReduceOpFn(MIOPEN_REDUCE_TENSOR_ADD)) opNoIdx; + decltype(reduce::ReduceOpFn2(MIOPEN_REDUCE_TENSOR_ADD)) opWithIdx; + + Reducer(miopenNanPropagation_t n, miopenReduceTensorOp_t rop, compType zero, bool useIdx) + : acc(zero), + withIdx(useIdx), + idx(0), + nanOpt(n), + opNoIdx(reduce::ReduceOpFn(rop)), + opWithIdx(reduce::ReduceOpFn2(rop)) + { + } + + inline void step(compType v, int flat_i) + { + if(withIdx) + reduce::binop_with_nan_check2(nanOpt, opWithIdx, acc, v, idx, flat_i); + else + reduce::binop_with_nan_check(nanOpt, opNoIdx, acc, v); + } + + inline void combine(const Reducer& other) + { + if(withIdx) + reduce::binop_with_nan_check2(nanOpt, opWithIdx, acc, other.acc, idx, other.idx); + else + reduce::binop_with_nan_check(nanOpt, opNoIdx, acc, other.acc); + } +}; + +template +std::tuple, tensor> reduce_cpu_common(const miopenReduceTensorOp_t& reduceOp, + const miopenNanPropagation_t& nanOpt, + const std::vector& inLengths, + const std::vector& outLengths, + const std::vector& input, + const std::vector& inStrides, + const std::vector& output, + const std::vector& outStrides, + float alpha, + float beta, + bool parallel, + bool withIdx) +{ + using reduce::convert_type; + using reduce::ReduceOpZeroVal; + + // Partition dims + std::vector invariantDims, toReduceDims; + std::vector invLens, redLens, invStrides_v, redStrides_v; + + for(int i = 0; i < static_cast(inLengths.size()); ++i) + { + if(inLengths[i] == outLengths[i]) + { + invariantDims.push_back(i); + invLens.push_back(inLengths[i]); + invStrides_v.push_back(inStrides[i]); + } + else + { + toReduceDims.push_back(i); + redLens.push_back(inLengths[i]); + redStrides_v.push_back(inStrides[i]); + } + } + + const bool reduceAllDims = invariantDims.empty(); + + // unary ops & zero vals + const compType zeroV = ReduceOpZeroVal(reduceOp); + + // divider = Π reduced dims (or N if reduce-all) + std::size_t divider = 1; + if(reduceAllDims) + divider = std::accumulate( + inLengths.begin(), 
inLengths.end(), std::size_t{1}, std::multiplies<>()); + else + divider = + std::accumulate(redLens.begin(), redLens.end(), std::size_t{1}, std::multiplies<>()); + + auto PreUnaryOp = reduce::PreUnaryOpFn(reduceOp, divider); + auto PosUnaryOp = reduce::PosUnaryOpFn(reduceOp, divider); + + // outputs + auto res = tensor{outLengths}; + res.data = output; + auto res_indices = tensor{outLengths}; + if(withIdx) + std::fill(res_indices.begin(), res_indices.end(), 0); + + if(reduceAllDims) + { + // Flatten whole tensor + const std::size_t N = divider; // product of all dims + std::vector lens_radix; + reduce::build_radix(inLengths, lens_radix); + + // parallel chunking + std::size_t hw = + std::max(std::size_t{1}, static_cast(std::thread::hardware_concurrency())); + const std::size_t P = std::min(N, hw * 4ul); + const std::size_t chunk = (N + P - 1) / P; + + std::vector> partial; + partial.reserve(P); + for(std::size_t p = 0; p < P; ++p) + partial.emplace_back(nanOpt, reduceOp, zeroV, withIdx); + + auto worker = [&](int p) { + const std::size_t begin = std::size_t(p) * chunk; + const std::size_t end = std::min(begin + chunk, N); + + auto& r = partial[p]; + for(std::size_t i = begin; i < end; ++i) + { + const auto off = + reduce::linear_to_offset_by_lens_strides(i, inLengths, lens_radix, inStrides); + auto v = convert_type(input[off]); + PreUnaryOp(v); + r.step(v, static_cast(i)); // flat index across whole tensor + } + }; + + if(parallel) + { + miopen::par_for(static_cast(P), worker); + } + else + { + for(int p = 0; p < P; ++p) + { + worker(p); + } + } + + // combine + Reducer R(nanOpt, reduceOp, zeroV, withIdx); + for(std::size_t p = 0; p < P; ++p) + R.combine(partial[p]); + + // post + PosUnaryOp(R.acc); + if(alpha != 1.0f) + R.acc *= convert_type(alpha); + if(beta != 0.0f) + R.acc += convert_type(output[0]) * convert_type(beta); + + res.data[0] = convert_type(R.acc); + if(withIdx) + res_indices.data[0] = R.idx; + } + else + { + // Build radices for invariant and 
reduced subspaces + std::vector invRad, redRad; + reduce::build_radix(invLens, invRad); + reduce::build_radix(redLens, redRad); + + const std::size_t INV = + std::accumulate(invLens.begin(), invLens.end(), std::size_t{1}, std::multiplies<>()); + const std::size_t TR = divider; + + std::size_t hw = + std::max(std::size_t{1}, static_cast(std::thread::hardware_concurrency())); + const std::size_t Te = std::min(hw * 4ul, std::max(1, INV)); + const std::size_t chunk = (INV + Te - 1) / Te; + + auto worker = [&](int t) { + const std::size_t row0 = std::size_t(t) * chunk; + const std::size_t row1 = std::min(row0 + chunk, INV); + + for(std::size_t r = row0; r < row1; ++r) + { + // decode invariant multi-index; compute base offsets + std::size_t tmp = r; + std::size_t base_in_off = 0; + std::size_t base_out_off = 0; + for(std::size_t k = 0; k < invLens.size(); ++k) + { + const std::size_t idx = (tmp / invRad[k]) % invLens[k]; + base_in_off += idx * invStrides_v[k]; + base_out_off += idx * outStrides[invariantDims[k]]; + } + + Reducer R(nanOpt, reduceOp, zeroV, withIdx); + + // iterate reduced subspace + for(std::size_t i = 0; i < TR; ++i) + { + std::size_t tmp2 = i; + std::size_t red_off = 0; + for(std::size_t k = 0; k < redLens.size(); ++k) + { + const std::size_t idx = (tmp2 / redRad[k]) % redLens[k]; + red_off += idx * redStrides_v[k]; + } + + auto v = convert_type(input[base_in_off + red_off]); + PreUnaryOp(v); + R.step(v, static_cast(i)); // flat index inside reduced subspace + } + + PosUnaryOp(R.acc); + if(alpha != 1.0f) + R.acc *= convert_type(alpha); + if(beta != 0.0f) + R.acc += + convert_type(output[base_out_off]) * convert_type(beta); + + res.data[base_out_off] = convert_type(R.acc); + if(withIdx) + res_indices.data[base_out_off] = R.idx; + } + }; + + if(parallel) + { + miopen::par_for(static_cast(Te), worker); + } + else + { + for(int te = 0; te < Te; ++te) + { + worker(te); + } + } + } + + return {res, res_indices}; +} + +template +std::tuple, tensor> 
+reduce_cpu_common(const miopen::ReduceTensorDescriptor& reduceDesc, + const tensor& input, + const tensor& output, + float alpha, + float beta, + bool parallel, + bool withIdx) +{ + auto inLengths = input.desc.GetLengths(); + auto outLengths = output.desc.GetLengths(); + auto inStrides = input.desc.GetStrides(); + auto outStrides = output.desc.GetStrides(); + + const auto reduceOp = reduceDesc.reduceTensorOp_; + const auto nanOpt = reduceDesc.reduceTensorNanOpt_; + + return reduce_cpu_common(reduceOp, + nanOpt, + inLengths, + outLengths, + input.data, + inStrides, + output.data, + outStrides, + alpha, + beta, + parallel, + withIdx); +} + +#endif diff --git a/projects/miopen/miopen_utils/include/miopen_utils/fusionHost.hpp b/projects/miopen/miopen_utils/include/miopen_utils/fusionHost.hpp new file mode 100644 index 000000000000..2d1d33cc898a --- /dev/null +++ b/projects/miopen/miopen_utils/include/miopen_utils/fusionHost.hpp @@ -0,0 +1,993 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2018 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + *******************************************************************************/ +#pragma once +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +template +void convHostForward(const tensor& input, + tensor& output, + const tensor& weights, + const int bias_mode, + const tensor& bias, + const miopenConvolutionDescriptor_t convDesc) +{ + + int in_n, in_c, in_h, in_w; + int in_nstride, in_cstride, in_hstride, in_wstride; + std::tie(in_n, in_c, in_h, in_w) = miopen::tien<4>(input.desc.GetLengths()); + std::tie(in_nstride, in_cstride, in_hstride, in_wstride) = + miopen::tien<4>(input.desc.GetStrides()); + + int wei_n, wei_c, wei_h, wei_w; + int wei_nstride, wei_cstride, wei_hstride, wei_wstride; + std::tie(wei_n, wei_c, wei_h, wei_w) = miopen::tien<4>(weights.desc.GetLengths()); + std::tie(wei_nstride, wei_cstride, wei_hstride, wei_wstride) = + miopen::tien<4>(weights.desc.GetStrides()); + + int out_n, out_c, out_h, out_w; + int out_nstride, out_cstride, out_hstride, out_wstride; + std::tie(out_n, out_c, out_h, out_w) = miopen::tien<4>(output.desc.GetLengths()); + std::tie(out_nstride, out_cstride, out_hstride, out_wstride) = + miopen::tien<4>(output.desc.GetStrides()); + + int stride_h, stride_w, pad_h, pad_w, dilation_h, dilation_w; + miopenConvolutionMode_t mode; + miopenPaddingMode_t pmode = miopen::deref(convDesc).paddingMode; + miopenGetConvolutionDescriptor( + convDesc, &mode, &pad_h, &pad_w, &stride_h, &stride_w, &dilation_h, &dilation_w); + + if(pmode == miopenPaddingSame) + { + pad_h = (in_h % stride_h == 0) ? 
(std::max((wei_h - stride_h), 0)) + : (std::max((wei_h - (in_h % stride_h)), 0)); + pad_w = (in_w % stride_w == 0) ? (std::max((wei_w - stride_w), 0)) + : (std::max((wei_w - (in_w % stride_w)), 0)); + pad_h /= 2; + pad_w /= 2; + } + else if(pmode == miopenPaddingValid) + { + pad_h = 0; + pad_w = 0; + } + + if(out_h <= 0 || out_w <= 0) + MIOPEN_THROW("Invalid Test Case: Check Output Dimension."); + + for(int o = 0; o < out_n; o++) + { // mini-batch size + for(int w = 0; w < out_c; w++) + { // out_channels (num filters) + for(int i = 0; i < out_h; i++) + { // output_height (from getforwardoutputdim()) + int in_off_h = i * stride_h; + for(int j = 0; j < out_w; j++) + { // output_width (from getforwardoutputdim()) + /*auto acc = static_cast(0.);*/ + auto acc = static_cast(0.); + int in_off_w = j * stride_w; + for(int k = 0; k < in_c; k++) + { // in_channels (RGB) + for(int x = 0; x < wei_h; x++) + { + int in_x = in_off_h - pad_h + x * dilation_h; + if(in_x >= 0 && in_x < in_h) + { + for(int y = 0; y < wei_w; y++) + { + int in_y = in_off_w - pad_w + y * dilation_w; + if(in_y >= 0 && in_y < in_w) + { + acc += double( + static_cast(input[o * in_nstride + k * in_cstride + + in_x * in_w + in_y]) * + static_cast(weights(w, k, x, y))); + } + } + } + } + } + acc = bias_mode != 0 ? 
acc + static_cast(bias[w]) : acc; + output[o * out_nstride + w * out_cstride + i * out_hstride + j] = + static_cast(acc); + } + } + } + } +} + +template +void batchNormSpatialHostInference(const tensor& input, + tensor& output, + const tensor& scale, + const tensor& bias, + double epsilon, + const tensor& estimatedMean, + const tensor& estimatedVariance, + bool useInverseVariance = false) +{ + + int n_batches, channels, height, width; + std::tie(n_batches, channels, height, width) = miopen::tien<4>(input.desc.GetLengths()); + miopen::par_for(channels, 1, [&](int cidx) { // via channel + V mean = estimatedMean(0, cidx, 0, 0); + V variance = estimatedVariance(0, cidx, 0, 0); + double invertVar = + useInverseVariance ? static_cast(variance) : 1.0 / sqrt(variance + epsilon); + // process the batch per channel + for(int row = 0; row < height; row++) + { // via rows + for(int column = 0; column < width; column++) + { // via columns + for(int bidx = 0; bidx < n_batches; bidx++) + { // via mini_batch + double elemStd = static_cast(input(bidx, cidx, row, column)) - mean; + double inhat = elemStd * invertVar; + output(bidx, cidx, row, column) = + static_cast(scale(0, cidx, 0, 0) * inhat + bias(0, cidx, 0, 0)); + // printf("output: %f\n",scale(0, cidx, 0, 0) * inhat + bias(0, cidx, 0, 0)); + } + } + } + }); +} + +template +void batchNormPerActivHostInference(const tensor& input, + tensor& output, + const tensor& scale, + const tensor& bias, + double epsilon, + const tensor& estimatedMean, + const tensor& estimatedVariance, + bool useInverseVariance = false) +{ + int n_batches, channels, height, width; + std::tie(n_batches, channels, height, width) = miopen::tien<4>(input.desc.GetLengths()); + miopen::par_for(channels, 1, [&](int cidx) { // via channel + for(int row = 0; row < height; row++) + { // via rows + for(int column = 0; column < width; column++) + { // via columns + // apply down the n_batch dimension + double mean = estimatedMean(0, cidx, row, column); + double 
variance = estimatedVariance(0, cidx, row, column); + double elemInvVar = useInverseVariance ? variance : 1.0 / sqrt(variance + epsilon); + for(int bidx = 0; bidx < n_batches; bidx++) + { // via mini_batch + // per (x-dims) channel load a block of data into LDS + double elemStd = input(bidx, cidx, row, column) - mean; + double inhat = elemStd * elemInvVar; + output(bidx, cidx, row, column) = + scale(0, cidx, row, column) * inhat + bias(0, cidx, row, column); + // printf("output: %f\n",output(bidx, cidx, row, column)); + } + } + } + }); +} + +template +void batchNormSpatialHostFwdTrain(const tensor& input, + tensor& out, + const tensor& scale, + const tensor& bias, + double epsilon, + double expAvgFactor, + tensor& saveMean, + tensor& saveInvVar, + tensor& runMean, + tensor& runVar) +{ + + int height, width, n_batch, channels; + std::tie(n_batch, channels, height, width) = miopen::tien<4>(input.desc.GetLengths()); + const auto nhw = double(height * width * n_batch); + + miopen::par_for(channels, 1, [&](int cidx) { + double elemStd = 0.; + double variance_accum = 0.; + double mean_accum = 0.; + double invVar = 0.; + double newRunMean = 0.; + double adjust = 0.; + + // process the batch per channel + for(int bidx = 0; bidx < n_batch; bidx++) + { // via mini_batch + for(int row = 0; row < height; row++) + { // via rows + for(int column = 0; column < width; column++) + { // via columns + // #1 calculate the mean + // iterating through the stack of images in the mini_batch + auto inval = static_cast(input(bidx, cidx, row, column)); + mean_accum += inval; + variance_accum += inval * inval; + } // end for (column) + } // end for (row) + } // end for (n) + + mean_accum /= nhw; + variance_accum /= nhw; + variance_accum += (-mean_accum * mean_accum); + invVar = 1.0 / sqrt(variance_accum + epsilon); + + // #4 apply the normalization + // x_hat = (x_i - mean) / sqrt(variance_accum + epsilon) + for(int bidx = 0; bidx < n_batch; bidx++) + { // via mini_batch + for(int row = 0; 
row < height; row++) + { // via rows + for(int column = 0; column < width; column++) + { // via columns + // #5 Gamma and Beta adjust + // y_i = gamma*x_hat + beta + elemStd = (static_cast(input(bidx, cidx, row, column)) - + mean_accum); // (x_i - mean) + out(bidx, cidx, row, column) = static_cast( + scale(0, cidx, 0, 0) * (invVar * elemStd) + bias(0, cidx, 0, 0)); + } // for (column) + } // for (row) + } // end for(n_batchs) + if(!saveMean.data.empty()) + { + saveMean(0, cidx, 0, 0) = mean_accum; + saveInvVar(0, cidx, 0, 0) = invVar; + } + if(!runMean.data.empty()) + { + newRunMean = runMean(0, cidx, 0, 0) * (1 - expAvgFactor); + runMean(0, cidx, 0, 0) = mean_accum * expAvgFactor + newRunMean; // newMean*factor + tmp + // var(n+1) = p * var(n-1) + (1 - p)*(b/b-1)*var(n) + adjust = (n_batch * height * width == 1) ? variance_accum + : (nhw / (nhw - 1)) * variance_accum; + runVar(0, cidx, 0, 0) = + (1 - expAvgFactor) * runVar(0, cidx, 0, 0) + expAvgFactor * adjust; + } + }); +} + +template +void batchNormSpatialHostBwdTrain(const tensor& x_input, + tensor& dy_input, + tensor& dx_out, + const tensor& bnScale, + const tensor& bnBias, + tensor& dscale, + tensor& dbias, + const tensor& savedMean, + const tensor& savedInvVar, + miopenActivationMode_t activ_mode, + double activ_beta, + double activ_alpha) +{ + double activ_gamma = 0.; + int height, width, n_batch, channels; + std::tie(n_batch, channels, height, width) = miopen::tien<4>(x_input.desc.GetLengths()); + auto nhw = double(height * width * n_batch); + int in_cstride = height * width; + + if(activ_mode > 0) + { + tensor input_norm = + tensor{x_input.desc.GetLayout_t(), x_input.desc.GetLengths()}; + miopen::par_for(channels, 1, [&](int cidx) { + double mean = 0.0; + double invVar = 0.0; + double elemStd = 0.; + double mean_accum = 0.0; + double variance_accum = 0.0; + if(!savedMean.data.empty()) + { + mean = static_cast(savedMean(0, cidx, 0, 0)); // HxW elements + invVar = static_cast(savedInvVar(0, cidx, 0, 0)); 
// HxW elements + } + else + { + for(int row = 0; row < height; row++) + { // via rows + for(int column = 0; column < width; column++) + { // via columns + for(int bidx = 0; bidx < n_batch; bidx++) + { // via mini_batch + auto inval = static_cast(x_input(bidx, cidx, row, column)); + mean_accum += inval; + variance_accum += inval * inval; + } + } + } + mean_accum /= nhw; + variance_accum /= nhw; + variance_accum += (-mean_accum * mean_accum); + mean = mean_accum; + invVar = 1.0 / sqrt(variance_accum); + } + for(int row = 0; row < height; row++) + { // via rows + for(int column = 0; column < width; column++) + { // via columns + for(int bidx = 0; bidx < n_batch; bidx++) + { // via mini_batch + elemStd = static_cast(x_input(bidx, cidx, row, column)) - + mean; // (x_i - mean) + input_norm(bidx, cidx, row, column) = static_cast( + bnScale(0, cidx, 0, 0) * (elemStd * invVar) + bnBias(0, cidx, 0, 0)); + } + } + } + }); + + activationHostBnormBwd(activ_mode, + activ_gamma, + activ_beta, + activ_alpha, + dy_input.data, + input_norm.data, + dy_input.data); + } + + miopen::par_for(channels, 1, [&](int cidx) { + double elemStd = 0.; + unsigned int xhat_index; + double mean = 0.0; + double invVar = 0.0; + double dyelem = 0.; + std::vector xhat(static_cast(n_batch) * in_cstride, 0.0); + // process the batch per channel + dscale(0, cidx, 0, 0) = 0.; + dbias(0, cidx, 0, 0) = 0.; + + if(!savedMean.data.empty()) + { + + mean = savedMean(0, cidx, 0, 0); // HxW elements + invVar = savedInvVar(0, cidx, 0, 0); // HxW elements + } + else + { + double variance_accum = 0.; + double mean_accum = 0.; + double inv_Var = 0.; + + // process the batch per channel + for(int bidx = 0; bidx < n_batch; bidx++) + { // via mini_batch + for(int row = 0; row < height; row++) + { // via rows + for(int column = 0; column < width; column++) + { // via columns + // #1 calculate the mean + // iterating through the stack of images in the mini_batch + auto inval = static_cast(x_input(bidx, cidx, row, column)); 
+ mean_accum += inval; + variance_accum += inval * inval; + } // end for (column) + } // end for (row) + } // end for (n) + + mean_accum /= nhw; + variance_accum /= nhw; + variance_accum += (-mean_accum * mean_accum); + inv_Var = 1.0 / sqrt(variance_accum); + + mean = mean_accum; + invVar = inv_Var; + } + for(int row = 0; row < height; row++) + { // via rows + for(int column = 0; column < width; column++) + { // via columns + for(int bidx = 0; bidx < n_batch; bidx++) + { // via mini_batch + xhat_index = in_cstride * bidx + (width * row + column); + // per (x-dims) channel load a block of data into LDS + elemStd = static_cast(x_input(bidx, cidx, row, column)) - + mean; // (x_i - mean) + xhat[xhat_index] = elemStd * invVar; + dyelem = static_cast(dy_input(bidx, cidx, row, column)); + dbias(0, cidx, 0, 0) += dyelem; + dscale(0, cidx, 0, 0) += xhat[xhat_index] * dyelem; + } // end for(n_batch) + } // for (column) + } // for (row) + + for(int row = 0; row < height; row++) + { // via rows + for(int column = 0; column < width; column++) + { // via columns + for(int bidx = 0; bidx < n_batch; bidx++) + { // via mini_batch + xhat_index = in_cstride * bidx + (width * row + column); + + double tmp1 = nhw * dy_input(bidx, cidx, row, column) - dbias(0, cidx, 0, 0); + double tmp2 = -xhat[xhat_index] * dscale(0, cidx, 0, 0); + double tmp3 = (bnScale(0, cidx, 0, 0) * invVar) / nhw; + dx_out(bidx, cidx, row, column) = + static_cast(tmp3 * (tmp2 + tmp1)); + } // end for(n_batchs) + } // for (column) + } // for (row) + }); // for (channel) +} + +template +void batchNormActivSpatialHostBwdTrain(miopenActivationMode_t activMode, + double gamma, + double beta, + double alpha, + const tensor& x_input, + const tensor& dy_input, + const tensor& y_input, + tensor& dx_out, + const tensor& bnScale, + const tensor& bias, + tensor& dscale, + tensor& dbias, + const tensor& savedMean, + const tensor& savedInvVar) +{ + + int height, width, n_batch, channels; + std::tie(n_batch, channels, height, 
width) = miopen::tien<4>(x_input.desc.GetLengths()); + auto nhw = double(height * width * n_batch); + int in_cstride = height * width; + + miopen::par_for(channels, 1, [&](int cidx) { + double elemStd = 0.; + unsigned int xhat_index; + double mean = static_cast(savedMean(0, cidx, 0, 0)); // HxW elements + double invVar = static_cast(savedInvVar(0, cidx, 0, 0)); // HxW elements + double dyelem = 0.; + std::vector xhat(static_cast(n_batch) * in_cstride, 0.0); + // process the batch per channel + dscale(0, cidx, 0, 0) = 0.; + dbias(0, cidx, 0, 0) = 0.; + + for(int row = 0; row < height; row++) + { // via rows + for(int column = 0; column < width; column++) + { // via columns + for(int bidx = 0; bidx < n_batch; bidx++) + { // via mini_batch + + // recompute forward batch norm + xhat_index = in_cstride * bidx + (width * row + column); + // per (x-dims) channel load a block of data into LDS + elemStd = static_cast(x_input(bidx, cidx, row, column)) - + mean; // (x_i - mean) + xhat[xhat_index] = elemStd * invVar; + double bnrefowd = + bnScale(0, cidx, 0, 0) * xhat[xhat_index] + bias(0, cidx, 0, 0); + activationHostBwdElement(activMode, + gamma, + beta, + alpha, + dy_input(bidx, cidx, row, column), + bnrefowd, + y_input(bidx, cidx, row, column), + dyelem); + dbias(0, cidx, 0, 0) += dyelem; + dscale(0, cidx, 0, 0) += xhat[xhat_index] * dyelem; + } // end for(n_batch) + } // for (column) + } // for (row) + + for(int row = 0; row < height; row++) + { // via rows + for(int column = 0; column < width; column++) + { // via columns + for(int bidx = 0; bidx < n_batch; bidx++) + { // via mini_batch + xhat_index = in_cstride * bidx + (width * row + column); + double bnrefowd = + bnScale(0, cidx, 0, 0) * xhat[xhat_index] + bias(0, cidx, 0, 0); + activationHostBwdElement(activMode, + gamma, + beta, + alpha, + dy_input(bidx, cidx, row, column), + bnrefowd, + y_input(bidx, cidx, row, column), + dyelem); + // double tmp1 = nhw * dy_input(bidx, cidx, row, column) - dbias(0, cidx, 0, 0); + 
double tmp1 = nhw * dyelem - dbias(0, cidx, 0, 0); + double tmp2 = -xhat[xhat_index] * dscale(0, cidx, 0, 0); + double tmp3 = (bnScale(0, cidx, 0, 0) * invVar) / nhw; + dx_out(bidx, cidx, row, column) = static_cast(tmp3 * (tmp2 + tmp1)); + } // end for(n_batchs) + } // for (column) + } // for (row) + }); // for (channel) +} + +template +void batchNormPerActHostFwdTrain(const tensor& input, + tensor& out, + const tensor& scale, + const tensor& bias, + double epsilon, + double expAvgFactor, + tensor& saveMean, + tensor& saveInvVar, + tensor& runMean, + tensor& runVar) +{ + + int height, width, n_batch, channels; + std::tie(n_batch, channels, height, width) = miopen::tien<4>(input.desc.GetLengths()); + const auto n = double(n_batch); + + miopen::par_for(channels, 1, [&](int cidx) { + double mean_accum = 0.; + double variance_accum = 0.; + double elemStd = 0.; + double elemInvVar = 0.; + double inhat = 0.; + double newRunMean = 0.; + double adjust = 0.; + + // process the batch per channel + for(int row = 0; row < height; row++) + { // via rows + for(int column = 0; column < width; column++) + { // via columns + + mean_accum = 0.; + variance_accum = 0.; + for(int bidx = 0; bidx < n_batch; bidx++) + { // via mini_batch + // #1 calculate the mean :: iterating through the stack of images in the + // mini_batch + auto intval = static_cast(input(bidx, cidx, row, column)); + mean_accum += intval; + variance_accum += intval * intval; + } + mean_accum /= n; + variance_accum /= n; + variance_accum = variance_accum - (mean_accum * mean_accum); + elemInvVar = 1.0 / double(sqrt(variance_accum + epsilon)); + + // #4 apply the normalization :: x_hat = (x_i - mean) / sqrt(variance_accum - + // epsilon) + for(int bidx = 0; bidx < n_batch; bidx++) + { // via mini_batch + elemStd = (input(bidx, cidx, row, column) - mean_accum); // (x_i - mean) + inhat = elemStd * elemInvVar; + // #5 Gamma and Beta adjust :: y_i = gamma*x_hat + beta + out(bidx, cidx, row, column) = static_cast( + 
scale(0, cidx, row, column) * inhat + bias(0, cidx, row, column)); + } // end for(n_batch) + + if(!runMean.data.empty()) + { + newRunMean = runMean(0, cidx, row, column) * (1.0 - expAvgFactor); + runMean(0, cidx, row, column) = + mean_accum * expAvgFactor + newRunMean; // newMean*factor + tmp + } + // var(n+1) = p * var(n-1) + (1 - p)*(b/b-1)*var(n) + if(!runVar.data.empty()) + { + adjust = (n_batch == 1) ? variance_accum : (n / (n - 1.0)) * variance_accum; + runVar(0, cidx, row, column) = + (1 - expAvgFactor) * runVar(0, cidx, row, column) + expAvgFactor * adjust; + } + if(!saveMean.data.empty() || !saveInvVar.data.empty()) + { + saveMean(0, cidx, row, column) = static_cast(mean_accum); + saveInvVar(0, cidx, row, column) = static_cast(elemInvVar); + } + + } // for (column) + } // for (row) + }); +} + +template +void batchNormPerActHostBwdTrain(const tensor& x_input, + const tensor& dy_input, + tensor& dx_out, + const tensor& scale, + tensor& dscale, + tensor& dbias, + const tensor& savedMean, + const tensor& savedInvVar) +{ + + int height, width, n_batch, channels; + std::tie(n_batch, channels, height, width) = miopen::tien<4>(x_input.desc.GetLengths()); + int in_cstride = height * width; + auto n = double(n_batch); + + miopen::par_for(channels, 1, [&](int cidx) { + double elemStd = 0.; + unsigned int xhat_index; + double mean = 0.; + double elemInvVar = 0.; + double dyelem = 0.; + double dxhat = 0.; + double dxhathat = 0.; + double tmp1 = 0.; + std::vector xhat(static_cast(n_batch) * in_cstride); + + // process the batch per channel + for(int row = 0; row < height; row++) + { // via rows + for(int column = 0; column < width; column++) + { // via columns + dxhat = 0.; + dxhathat = 0.; + + if(!savedMean.data.empty()) + { + mean = savedMean(0, cidx, row, column); // HxW elements + elemInvVar = savedInvVar(0, cidx, row, column); // HxW elements + } + else + { + double variance_accum = 0.; + double mean_accum = 0.; + + // process the batch per channel + for(int bidx = 
0; bidx < n_batch; bidx++) + { // via mini_batch + auto inval = static_cast(x_input(bidx, cidx, row, column)); + mean_accum += inval; + variance_accum += inval * inval; + } // end for (n) + + mean_accum /= n; + variance_accum /= n; + variance_accum += (-mean_accum * mean_accum); + + mean = mean_accum; + elemInvVar = 1.0 / sqrt(variance_accum); + } + + for(int bidx = 0; bidx < n_batch; bidx++) + { // via mini_batch + xhat_index = in_cstride * bidx + (width * row + column); + // per (x-dims) channel load a block of data into LDS + elemStd = static_cast(x_input(bidx, cidx, row, column)) - + mean; // (x_i - mean) + xhat[xhat_index] = elemStd * elemInvVar; + dyelem = static_cast(dy_input(bidx, cidx, row, column)); + dbias(0, cidx, row, column) += dyelem; + dscale(0, cidx, row, column) += xhat[xhat_index] * dyelem; + tmp1 = scale(0, cidx, row, column) * dyelem; + dxhat += tmp1; + dxhathat += tmp1 * xhat[xhat_index]; + + } // end for(n_batchs) + + for(int bidx = 0; bidx < n_batch; bidx++) + { // via mini_batch + xhat_index = in_cstride * bidx + (width * row + column); + tmp1 = xhat[xhat_index] * dxhathat + dxhat; + double tmp2 = + n_batch * scale(0, cidx, row, column) * dy_input(bidx, cidx, row, column) - + tmp1; + double tmp3 = elemInvVar / (double(n)); + dx_out(bidx, cidx, row, column) = static_cast(tmp3 * tmp2); + } // end for(n_batchs) + } // for (column) + } // for (row) + }); +} + +template +void batchNormActivPerActHostBwdTrain(miopenActivationMode_t activMode, + double gamma, + double beta, + double alpha, + const tensor& x_input, + const tensor& dy_input, + const tensor& y_input, + tensor& dx_out, + const tensor& scale, + const tensor& bias, + tensor& dscale, + tensor& dbias, + const tensor& savedMean, + const tensor& savedInvVar) +{ + + int height, width, n_batch, channels; + std::tie(n_batch, channels, height, width) = miopen::tien<4>(x_input.desc.GetLengths()); + int in_cstride = height * width; + auto n = double(n_batch); + + miopen::par_for(channels, 1, 
[&](int cidx) { + double elemStd = 0.; + unsigned int xhat_index; + double mean = 0.; + double elemInvVar = 0.; + double dyelem = 0.; + double dxhat = 0.; + double dxhathat = 0.; + double tmp1 = 0.; + std::vector xhat(static_cast(n_batch) * in_cstride); + + // process the batch per channel + for(int row = 0; row < height; row++) + { // via rows + for(int column = 0; column < width; column++) + { // via columns + dxhat = 0.; + dxhathat = 0.; + + mean = savedMean(0, cidx, row, column); // HxW elements + elemInvVar = savedInvVar(0, cidx, row, column); // HxW elements + + for(int bidx = 0; bidx < n_batch; bidx++) + { // via mini_batch + xhat_index = in_cstride * bidx + (width * row + column); + // per (x-dims) channel load a block of data into LDS + elemStd = static_cast(x_input(bidx, cidx, row, column)) - + mean; // (x_i - mean) + xhat[xhat_index] = elemStd * elemInvVar; + double bnrefowd = + scale(0, cidx, row, column) * xhat[xhat_index] + bias(0, cidx, row, column); + activationHostBwdElement(activMode, + gamma, + beta, + alpha, + dy_input(bidx, cidx, row, column), + bnrefowd, + y_input(bidx, cidx, row, column), + dyelem); + /*dyelem = static_cast(dy_input(bidx, cidx, row, column));*/ + dbias(0, cidx, row, column) += dyelem; + dscale(0, cidx, row, column) += xhat[xhat_index] * dyelem; + tmp1 = scale(0, cidx, row, column) * dyelem; + dxhat += tmp1; + dxhathat += tmp1 * xhat[xhat_index]; + + } // end for(n_batchs) + + for(int bidx = 0; bidx < n_batch; bidx++) + { // via mini_batch + xhat_index = in_cstride * bidx + (width * row + column); + tmp1 = xhat[xhat_index] * dxhathat + dxhat; + double bnrefowd = + scale(0, cidx, row, column) * xhat[xhat_index] + bias(0, cidx, row, column); + activationHostBwdElement(activMode, + gamma, + beta, + alpha, + dy_input(bidx, cidx, row, column), + bnrefowd, + y_input(bidx, cidx, row, column), + dyelem); + double tmp2 = (n_batch * scale(0, cidx, row, column) * dyelem) - tmp1; + double tmp3 = elemInvVar / (double(n)); + dx_out(bidx, 
cidx, row, column) = static_cast(tmp3 * tmp2); + } // end for(n_batchs) + } // for (column) + } // for (row) + }); +} + +template +void visitActivationHostInfer( + miopenActivationMode_t activMode, double gamma, double beta, double alpha, F f) +{ + switch(activMode) + { + case miopenActivationPASTHRU: // x + f([=](double x) { return x; }); + break; + case miopenActivationLOGISTIC: // 1 / (1 + e^-x) //Sigmoid + f([=](double x) { return (1. / (1. + std::exp(-x))); }); + break; + case miopenActivationTANH: // beta * tanh(alpha * x) + f([=](double x) { return (beta * std::tanh(alpha * x)); }); + break; + case miopenActivationRELU: // max(0, x) + f([=](double x) { return ((x > 0.) ? x : 0.); }); + break; + case miopenActivationSOFTRELU: // log(1 + e^x) // bonomial normal log likelihood + f([=](double x) { + return (x > 0.) ? (x + std::log1p(std::exp(-x))) : (std::log1p(std::exp(x))); + }); + break; + case miopenActivationABS: // abs(x) + f([=](double x) { return (std::fabs(x)); }); + break; + case miopenActivationPOWER: // (alpha + beta * x) ^ gamma + f([=](double x) { + auto v = (alpha + beta * x); + return (v <= std::numeric_limits::epsilon()) ? 0. : pow(v, gamma); + }); + break; + case miopenActivationCLIPPEDRELU: // min(alpha, max(0, x)) + f([=](double x) { return (std::min(alpha, std::max(double(0.), x))); }); + break; + case miopenActivationLEAKYRELU: // alpha * x | x<=0; x | x>0 + f([=](double x) { return ((x > 0.) ? x : x * alpha); }); + break; + case miopenActivationELU: // alpha * (exp(x)-1) | x<=0; x | x>0 + f([=](double x) { return ((x > 0.) ? 
x : alpha * std::expm1(x)); }); + break; + case miopenActivationCLAMP: // max(alpha, min(beta, x)) + f([=](double x) { return (std::max(alpha, std::min(beta, x))); }); + break; + // default: printf("ERROR: unknown neuron type: %d\n", activMode); break; + } +} + +template +inline void activationHostInfer(miopenActivationMode_t activMode, + double gamma, + double beta, + double alpha, + const std::vector input, + std::vector& output) +{ + visitActivationHostInfer(activMode, gamma, beta, alpha, [&](auto f) { + miopen::par_for(input.size(), 1, [&](int index) { + output[index] = static_cast(f(static_cast(input[index]))); + }); + }); +} + +template +void visitActivationHostBwd( + miopenActivationMode_t activMode, double gamma, double beta, double alpha, F f) +{ + switch(activMode) + { + case miopenActivationPASTHRU: // x + f([=](double dy, double, double) { return dy; }); + break; + case miopenActivationLOGISTIC: // 1 / (1 + e^-x) //Sigmoid + f([=](double dy, double, double y) { return dy * y * (1 - y); }); + break; + case miopenActivationTANH: // beta * tanh(alpha * x) + f([=](double dy, double, double y) { return dy * alpha * (beta - y * y / beta); }); + break; + case miopenActivationRELU: // max(0, x) + f([=](double dy, double x, double) { return (x > 0) ? dy : 0; }); + break; + case miopenActivationSOFTRELU: // log(1 + e^x) // bonomial normal log likelihood + f([=](double dy, double x, double) { + static const double threshold = 50.; + double expval = std::exp(std::min(x, threshold)); + return dy * expval / (expval + 1.0); + }); + break; + case miopenActivationABS: // abs(x) + f([=](double dy, double x, double) { return dy * ((x > 0) ? 1 : -1); }); + break; + case miopenActivationPOWER: // (alpha + beta * x) ^ gamma + f([=](double, double x, double y) { + auto v = alpha + beta * x; + return v <= std::numeric_limits::epsilon() ? 
0 : gamma * beta * y / v; + }); + break; + case miopenActivationCLIPPEDRELU: // min(alpha, max(0, x)) + f([=](double dy, double x, double) { return (x > 0 && x <= alpha) ? dy : 0; }); + break; + case miopenActivationLEAKYRELU: // alpha * x | x<=0; x | x>0 + f([=](double dy, double x, double) { return dy * ((x > 0) ? 1 : alpha); }); + break; + case miopenActivationELU: // alpah * (exp(x)-1) | x<=0; x | x>0 + f([=](double dy, double x, double y) { return dy * ((x > 0) ? 1 : y + alpha); }); + break; + case miopenActivationCLAMP: // max(alpha, min(beta, x)) + f([=](double dy, double x, double) { return (x > alpha && x <= beta) ? dy : 0; }); + break; + // default: printf("ERROR: unknown neuron type: %d\n", activMode); break; + } +} + +template +inline void activationHostBnormBwd(miopenActivationMode_t activMode, + double gamma, + double beta, + double alpha, + const std::vector dyinput, + const std::vector xinput, + std::vector& output) +{ + double dummy; + visitActivationHostBwd(activMode, gamma, beta, alpha, [&](auto f) { + miopen::par_for(dyinput.size(), 1, [&](int index) { + output[index] = static_cast( + f(static_cast(dyinput[index]), static_cast(xinput[index]), dummy)); + }); + }); +} + +template +inline void activationHostBwd(miopenActivationMode_t activMode, + double gamma, + double beta, + double alpha, + const std::vector dyinput, + const std::vector xinput, + const std::vector yinput, + std::vector& output) +{ + visitActivationHostBwd(activMode, gamma, beta, alpha, [&](auto f) { + miopen::par_for(dyinput.size(), 1, [&](int index) { + output[index] = static_cast(f(static_cast(dyinput[index]), + static_cast(xinput[index]), + static_cast(yinput[index]))); + }); + }); +} + +inline void activationHostBwdElement(miopenActivationMode_t activMode, + double gamma, + double beta, + double alpha, + const double dyinput, + const double xinput, + const double yinput, + double& output) +{ + visitActivationHostBwd(activMode, gamma, beta, alpha, [&](auto f) { + output = 
static_cast(f(dyinput, xinput, yinput)); + }); +} + +template +tensor get_output_tensor(const miopen::ConvolutionDescriptor& filter, + const tensor& input, + const tensor& weights) +{ + return tensor{filter.GetForwardOutputTensor(input.desc, weights.desc, miopen_type{})}; +} diff --git a/projects/miopen/miopen_utils/include/miopen_utils/gemm.hpp b/projects/miopen/miopen_utils/include/miopen_utils/gemm.hpp new file mode 100644 index 000000000000..81c38db0fdf3 --- /dev/null +++ b/projects/miopen/miopen_utils/include/miopen_utils/gemm.hpp @@ -0,0 +1,120 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2017 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + *******************************************************************************/ +#ifndef GUARD_GEMM_HPP +#define GUARD_GEMM_HPP + +#include +#include +#include + +/* + A and B rows and cols should be passed as default values (NxM, MxK), independently of + a_transponse/b_transpose flag value + C rows and cols should have correct values based on a_transponse/b_transpose values + A, B, C strides should have corret values based on a_transponse/b_transpose values +*/ +template +void gemm_cpu(const Dtype* a_ptr, + const size_t a_cols, + const size_t a_rows, + const size_t a_stride, + const bool a_transpose, + const Dtype* b_ptr, + const size_t b_cols, + const size_t b_rows, + const size_t b_stride, + const bool b_transpose, + Dtype* c_ptr, + const size_t c_cols, + const size_t c_rows, + const size_t c_stride, + double alpha = 1.0, + double beta = 1.0) +{ + if((!a_transpose && !b_transpose && + ((a_cols != b_rows) || (a_rows != c_rows) || (b_cols != c_cols))) || + (a_transpose && b_transpose && + ((a_rows != b_cols) || (a_cols != c_rows) || (b_rows != c_cols))) || + (a_transpose && !b_transpose && + ((a_rows != b_rows) || (a_cols != c_rows) || (b_cols != c_cols))) || + (!a_transpose && b_transpose && + ((a_cols != b_cols) || (a_rows != c_rows) || (b_rows != c_cols)))) + { + MIOPEN_THROW("MM_CPU_ERROR. Incompatible matrix size:\nA: " + std::to_string(a_rows) + "x" + + std::to_string(a_cols) + " transpose: " + (a_transpose ? "true" : "false") + + "\nB: " + std::to_string(b_rows) + "x" + std::to_string(b_cols) + + " transpose: " + (b_transpose ? "true" : "false") + + "\nC: " + std::to_string(c_rows) + "x" + std::to_string(c_cols) + "\n"); + } + + size_t inner_loop_limit = a_transpose ? 
a_rows : a_cols; + auto inner_loop = [&](int m, int n) { + double el = 0.0; + if(!a_transpose && !b_transpose) + { + miopen::ford(inner_loop_limit)([&](int k) { + el += static_cast(a_ptr[m * a_stride + k]) * + static_cast(b_ptr[k * b_stride + n]); + }); + } + else if(!a_transpose && b_transpose) + { + miopen::ford(inner_loop_limit)([&](int k) { + el += static_cast(a_ptr[m * a_stride + k]) * + static_cast(b_ptr[n * b_stride + k]); + }); + } + else if(a_transpose && !b_transpose) + { + miopen::ford(inner_loop_limit)([&](int k) { + el += static_cast(a_ptr[k * a_stride + m]) * + static_cast(b_ptr[k * b_stride + n]); + }); + } + else + { + miopen::ford(inner_loop_limit)([&](int k) { + el += static_cast(a_ptr[k * a_stride + m]) * + static_cast(b_ptr[n * b_stride + k]); + }); + } + + c_ptr[m * c_stride + n] = + static_cast(beta * static_cast(c_ptr[m * c_stride + n]) + alpha * el); + }; + + constexpr size_t iter_margin = 1'048'576; // 2^20 + if(c_rows * c_cols * inner_loop_limit > iter_margin) + { + miopen::par_ford(c_rows, c_cols)(inner_loop); + } + else + { + miopen::ford(c_rows, c_cols)(inner_loop); + } +} + +#endif diff --git a/projects/miopen/miopen_utils/include/miopen_utils/network_data.hpp b/projects/miopen/miopen_utils/include/miopen_utils/network_data.hpp new file mode 100644 index 000000000000..987d4dda9929 --- /dev/null +++ b/projects/miopen/miopen_utils/include/miopen_utils/network_data.hpp @@ -0,0 +1,438 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2017 Advanced Micro Devices, Inc. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + *******************************************************************************/ + +#ifndef GUARD_MIOPEN_TEST_NETWORK_DATA_HPP +#define GUARD_MIOPEN_TEST_NETWORK_DATA_HPP + +#include +#include +#include +#include + +#ifndef MIOPEN_TEST_DEFAULT_BATCH_SIZE_FACTOR +#define MIOPEN_TEST_DEFAULT_BATCH_SIZE_FACTOR 0 +#endif + +template +inline constexpr T pick_batch_size(T x, T y) +{ + return (y == 0 || y > x) ? 
1 : x / y; +} + +// Reduce tests execution time +#define MIOPEN_TESTS_GET_INPUTS_ENABLE_HUGE_TENSORS 1 + +template +inline std::set> get_inputs(T n = MIOPEN_TEST_DEFAULT_BATCH_SIZE_FACTOR) +{ + // clang-format off + return + { + { pick_batch_size(32, n), 1, 14, 14 }, + { pick_batch_size(100, n), 1, 8, 8 }, + { pick_batch_size(256, n), 1, 27, 27 }, +#if MIOPEN_TESTS_GET_INPUTS_ENABLE_HUGE_TENSORS + { pick_batch_size(64, n), 19, 1024,2048}, +#endif + { pick_batch_size(100, n), 3, 32, 32 }, + { pick_batch_size(100, n), 32, 16, 16 }, + { pick_batch_size(100, n), 32, 8, 8 }, + { pick_batch_size(128, n), 256, 12, 12 }, + { pick_batch_size(128, n), 3, 231, 231 }, + { pick_batch_size(128, n), 512, 12, 12 }, + { pick_batch_size(256, n), 256, 13, 13 }, + { pick_batch_size(256, n), 3, 227, 227 }, + { pick_batch_size(256, n), 384, 13, 13 }, + { pick_batch_size(256, n), 96, 27, 27 }, + { pick_batch_size(32, n), 128, 28, 28 }, + { pick_batch_size(32, n), 144, 14, 14 }, + { pick_batch_size(32, n), 192, 28, 28 }, + { pick_batch_size(32, n), 192, 7, 7 }, + { pick_batch_size(32, n), 256, 28, 28 }, + { pick_batch_size(32, n), 3, 224, 224 }, + { pick_batch_size(32, n), 32, 28, 28 }, + { pick_batch_size(32, n), 48, 7, 7 }, + { pick_batch_size(32, n), 480, 128, 256 }, + { pick_batch_size(32, n), 480, 64, 128 }, + { pick_batch_size(32, n), 512, 4, 4 }, + { pick_batch_size(32, n), 512, 64, 128 }, + { pick_batch_size(16, n), 64, 56, 56 }, + { pick_batch_size(32, n), 832, 7, 7 }, + { pick_batch_size(64, n), 128, 56, 56 }, + { pick_batch_size(64, n), 256, 28, 28 }, + { pick_batch_size(64, n), 3, 224, 224 }, + { pick_batch_size(64, n), 512, 28, 28 }, + { pick_batch_size(64, n), 64, 112, 112 }, + { pick_batch_size(32, n), 64, 14, 14 }, + { pick_batch_size(32, n), 192, 14, 14 }, + { pick_batch_size(32, n), 320, 28, 28 }, + { pick_batch_size(32, n), 576, 14, 14 }, + { pick_batch_size(32, n), 576, 4, 4 }, + { pick_batch_size(32, n), 1056, 7, 7 }, + { pick_batch_size(32, n), 2048, 11, 11 }, +#if 
MIOPEN_TESTS_GET_INPUTS_ENABLE_HUGE_TENSORS + { pick_batch_size(32, n), 16, 2048, 2048 }, + { pick_batch_size(32, n), 16, 3072, 3072 }, + { pick_batch_size(32, n), 16, 4096, 4096 }, +#endif + { 1, 1, 1, 1 } + }; + // clang-format on +} + +template +inline std::set> get_weights(T n = MIOPEN_TEST_DEFAULT_BATCH_SIZE_FACTOR) +{ + // clang-format off + return + { + { pick_batch_size(1024, n),1024, 3, 3 }, + { pick_batch_size(1024, n),512, 3, 3 }, + { pick_batch_size(128, n), 256, 1, 1 }, + { pick_batch_size(128, n), 528, 1, 1 }, + { pick_batch_size(128, n), 96, 3, 3 }, + { pick_batch_size(16, n), 192, 1, 1 }, + { pick_batch_size(224, n), 112, 3, 3 }, + { pick_batch_size(256, n), 96, 5, 5 }, + { pick_batch_size(288, n), 144, 3, 3 }, + { pick_batch_size(48, n), 832, 1, 1 }, + { pick_batch_size(512, n), 256, 3, 3 }, + { pick_batch_size(64, n), 1, 2, 2 }, + { pick_batch_size(64, n), 3, 3, 3 }, + { pick_batch_size(64, n), 3, 7, 7 }, + { pick_batch_size(64, n), 32, 5, 5 }, + { pick_batch_size(64, n), 480, 1, 1 }, + { pick_batch_size(64, n), 64, 1, 1 }, + { pick_batch_size(96, n), 3, 11, 11 }, + { pick_batch_size(192, n), 64, 5, 5 }, + { pick_batch_size(64, n), 64, 3, 3 }, + { pick_batch_size(224, n), 224, 3, 3 }, + { pick_batch_size(224, n), 192, 3, 3 }, + { pick_batch_size(128, n), 320, 1, 1 }, + { pick_batch_size(192, n), 576, 1, 1 }, + { pick_batch_size(128, n), 1056, 1, 1 }, + { pick_batch_size(128, n), 1024, 1, 1 }, + { pick_batch_size(512, n), 2048, 1, 1 } + }; + // clang-format on +} + +template +inline std::set> get_immed_inputs(T n = MIOPEN_TEST_DEFAULT_BATCH_SIZE_FACTOR) +{ + // clang-format off + return + { + { pick_batch_size(32, n), 1, 14, 14 }, + { pick_batch_size(256, n), 1, 27, 27 }, + { pick_batch_size(128, n), 512, 12, 12 }, + { pick_batch_size(256, n), 256, 13, 13 }, + { pick_batch_size(256, n), 3, 227, 227 }, + { pick_batch_size(32, n), 64, 56, 56 }, + { pick_batch_size(32, n), 96, 14, 14 }, + { pick_batch_size(32, n), 96, 28, 28 }, + { pick_batch_size(64, 
n), 128, 56, 56 }, + { pick_batch_size(64, n), 3, 224, 224 }, + { pick_batch_size(64, n), 256, 14, 14 }, + { 1, 1, 1, 1 } + }; + // clang-format on +} + +template +inline std::set> get_immed_weights(T n = MIOPEN_TEST_DEFAULT_BATCH_SIZE_FACTOR) +{ + // clang-format off + return + { + { pick_batch_size(208, n), 96, 3, 3 }, + { pick_batch_size(24, n), 512, 1, 1 }, + { pick_batch_size(256, n), 128, 3, 3 }, + { pick_batch_size(256, n), 256, 3, 3 }, + { pick_batch_size(256, n), 64, 5, 5 }, + { pick_batch_size(288, n), 144, 3, 3 }, + { pick_batch_size(96, n), 3, 11, 11 }, + { pick_batch_size(32, n), 128, 5, 5 }, + { pick_batch_size(32, n), 128, 1, 1 }, + { pick_batch_size(256, n), 256, 3, 3 }, + { pick_batch_size(512, n), 512, 3, 3 }, + { pick_batch_size(160, n), 128, 3, 3 }, + { pick_batch_size(32, n), 3, 7, 7 } + }; + // clang-format on +} + +template +inline std::set> +get_3d_conv_input_shapes(T n = MIOPEN_TEST_DEFAULT_BATCH_SIZE_FACTOR) +{ + // clang-format off + return + { + { pick_batch_size(128, n), 1, 1, 2, 2}, + { pick_batch_size(128, n), 64, 1, 1, 1}, + { pick_batch_size(128, n), 64, 3, 4, 4}, + { pick_batch_size(352, n), 32, 4, 9, 9}, + { pick_batch_size(192, n), 512, 3, 14, 14}, + { pick_batch_size(352, n), 512, 4, 28, 28}, + { pick_batch_size(256, n), 512, 4, 56, 56}, + { pick_batch_size(192, n), 3, 4, 227, 227}, + { pick_batch_size(128, n), 4, 4, 161, 700} + }; + // clang-format on +} + +template +inline std::set> +get_3d_conv_weight_shapes(T n = MIOPEN_TEST_DEFAULT_BATCH_SIZE_FACTOR) +{ + // clang-format off + return + { + { pick_batch_size( 128, n), 1, 1, 1, 1}, + { pick_batch_size( 352, n), 128, 1, 1, 1}, + { pick_batch_size( 256, n), 128, 1, 1, 1}, + { pick_batch_size( 352, n), 32, 3, 3, 3}, + { pick_batch_size( 352, n), 4, 3, 3, 3}, + { pick_batch_size( 160, n), 4, 3, 5, 5}, + { pick_batch_size( 128, n), 64, 5, 7, 7}, + { pick_batch_size( 192, n), 4, 3, 11, 11}, + { pick_batch_size( 128, n), 1, 3, 1, 7}, + { pick_batch_size( 128, n), 1, 3, 7, 1}, + { 
pick_batch_size( 128, n), 1, 3, 5, 20} + }; + // clang-format on +} + +template +inline std::set> get_bn_peract_inputs(T n = MIOPEN_TEST_DEFAULT_BATCH_SIZE_FACTOR) +{ + // clang-format off + return + { + { pick_batch_size(32, n), 4, 1024,2048}, //Making this much smaller + { pick_batch_size(100, n), 3, 32, 32 }, + { pick_batch_size(100, n), 32, 8, 8 }, + { pick_batch_size(128, n), 256, 12, 12 }, + { pick_batch_size(256, n), 3, 227, 227 }, + { pick_batch_size(64, n), 64, 112, 112 },//Batch-norm ResNet 152 after this line + { pick_batch_size(256, n), 1024, 14, 14 },// n is from the paper @ 256 + { pick_batch_size(256, n), 2048, 7, 7 }, + { pick_batch_size(256, n), 256, 56, 56 }, + { pick_batch_size(256, n), 256, 14, 14 }, + { pick_batch_size(256, n), 512, 28, 28 }, + { pick_batch_size(256, n), 512, 7, 7 }, + { pick_batch_size(256, n), 64, 112, 112 }, + { pick_batch_size(256, n), 64, 56, 56 },//Batch-norm Inception_v3 after this + { pick_batch_size(32, n), 1024, 1, 1 },// n is from the paper @ 32 + { pick_batch_size(32, n), 128, 14, 14 }, + { pick_batch_size(32, n), 128, 28, 28 }, + { pick_batch_size(32, n), 128, 4, 4 }, + { pick_batch_size(32, n), 128, 7, 7 }, + { pick_batch_size(32, n), 160, 7, 7 }, + { pick_batch_size(32, n), 192, 14, 14 }, + { pick_batch_size(32, n), 192, 56, 56 }, + { pick_batch_size(32, n), 192, 7, 7 }, + { pick_batch_size(32, n), 224, 14, 14 }, + { pick_batch_size(32, n), 256, 7, 7 }, + { pick_batch_size(32, n), 256, 14, 14 }, + { pick_batch_size(32, n), 352, 7, 7 }, + { pick_batch_size(32, n), 64, 112, 112 }, + { pick_batch_size(32, n), 64, 14, 14 }, + { pick_batch_size(32, n), 64, 56, 56 }, + { pick_batch_size(32, n), 96, 28, 28 }, + { pick_batch_size(32, n), 32, 256, 512 }, //Killing this config. 
Takes way too long on the CPU + { pick_batch_size(32, n), 256, 28, 28 }, + { pick_batch_size(32, n), 3, 224, 224 }, + { pick_batch_size(32, n), 480, 128, 256 }, + { pick_batch_size(32, n), 528, 64, 128 } + }; + // clang-format on +} + +template +inline std::set> get_bn_spatial_inputs(T n = MIOPEN_TEST_DEFAULT_BATCH_SIZE_FACTOR) +{ + // clang-format off + return + { + { pick_batch_size(32, n), 4, 1024,2048}, //Making this much smaller + { pick_batch_size(32, n), 192, 256, 512 }, + { pick_batch_size(32, n), 480, 128, 256 }, + { pick_batch_size(256, n), 3, 227, 227 }, + { pick_batch_size(256, n), 64, 112, 112 }, + { pick_batch_size(512, n), 16, 32, 32 }, + { pick_batch_size(100, n), 32, 8, 8 }, + { pick_batch_size(128, n), 256, 12, 12 }, + { pick_batch_size(256, n), 128, 28, 28 }, + { pick_batch_size(256, n), 2048, 7, 7 }, + { pick_batch_size(256, n), 256, 56, 56 }, + { pick_batch_size(256, n), 256, 14, 14 }, + { pick_batch_size(256, n), 512, 28, 28 }, + { pick_batch_size(256, n), 512, 7, 7 }, + { pick_batch_size(256, n), 64, 56, 56 },//Batch-norm Inception_v3 after this + { pick_batch_size(32, n), 1024, 1, 1 },// n is from the paper @ 32 + { pick_batch_size(32, n), 128, 14, 14 }, + { pick_batch_size(32, n), 128, 4, 4 }, + { pick_batch_size(32, n), 160, 7, 7 }, + { pick_batch_size(32, n), 192, 14, 14 }, + { pick_batch_size(32, n), 192, 56, 56 }, + { pick_batch_size(32, n), 192, 7, 7 }, + { pick_batch_size(32, n), 224, 14, 14 }, + { pick_batch_size(32, n), 256, 7, 7 }, + { pick_batch_size(32, n), 352, 7, 7 }, + { pick_batch_size(32, n), 64, 14, 14 }, + { pick_batch_size(32, n), 64, 28, 28 }, + { pick_batch_size(32, n), 64, 56, 56 }, + { pick_batch_size(32, n), 96, 28, 28 }, + { pick_batch_size(32, n), 192, 256, 512 }, + { pick_batch_size(32, n), 256, 28, 28 }, + { pick_batch_size(32, n), 3, 224, 224 }, + { pick_batch_size(32, n), 480, 128, 256 }, + { pick_batch_size(32, n), 528, 64, 128 }, + { pick_batch_size(770, n), 1, 8, 8 }, + { pick_batch_size(770, n), 1024, 1, 1 
}, + { pick_batch_size(152, n), 128, 80, 80 }, + { pick_batch_size(152, n), 256, 20, 20 }, + { pick_batch_size(152, n), 32, 160, 160 }, + { pick_batch_size(152, n), 512, 20, 20 }, + { pick_batch_size(152, n), 64, 160, 160 }, + { pick_batch_size(152, n), 64, 80, 80 }, + { pick_batch_size(256, n), 256, 20, 20 }, + { pick_batch_size(256, n), 512, 20, 20 } + }; + // clang-format on +} + +template +inline std::set> get_3d_bn_peract_inputs(T n = MIOPEN_TEST_DEFAULT_BATCH_SIZE_FACTOR) +{ + // clang-format off + return + { + { pick_batch_size(32, n), 1, 32, 32, 32 }, // 32x32x32 based on VoxNet arch + { pick_batch_size(32, n), 1, 14, 14, 14 }, + { pick_batch_size(32, n), 32, 14, 14, 14 }, + { pick_batch_size(32, n), 32, 12, 12, 12 }, + { pick_batch_size(32, n), 32, 6, 6, 6 }, + { pick_batch_size(256, n), 1, 32, 32, 32 }, // 32x32x32 based on VoxNet arch + { pick_batch_size(256, n), 32, 14, 14, 14 }, + { pick_batch_size(256, n), 32, 12, 12, 12 }, + { pick_batch_size(256, n), 32, 6, 6, 6 }, + { pick_batch_size(512, n), 1, 32, 32, 32 }, // 32x32x32 based on VoxNet arch + { pick_batch_size(512, n), 32, 14, 14, 14 }, + { pick_batch_size(512, n), 32, 12, 12, 12 }, + { pick_batch_size(512, n), 32, 6, 6, 6 }, + { pick_batch_size(32, n), 2, 32, 57, 125 }, // Hand-gesture recognition CVPR 2015 paper High Res Net Path + { pick_batch_size(32, n), 32, 14, 25, 59 }, + { pick_batch_size(32, n), 32, 6, 10, 27 }, + { pick_batch_size(32, n), 32, 4, 6, 11 }, + { pick_batch_size(32, n), 32, 2, 2, 3 }, + { pick_batch_size(32, n), 32, 32, 28, 62 }, // Hand-gesture recognition CVPR 2015 paper Low Res Net Path + { pick_batch_size(32, n), 32, 14, 12, 29 }, + { pick_batch_size(32, n), 32, 6, 4, 12 }, + { pick_batch_size(32, n), 32, 4, 2, 2 }, + { pick_batch_size(16, n), 32, 6, 50, 50 }, // Multi-view 3D convnet + { pick_batch_size(1, n), 3, 8, 240, 320 }, // 3D convet on video + { pick_batch_size(1, n), 3, 16, 240, 320 }, // 3D convet on video + { pick_batch_size(1, n), 3, 8, 128, 171 }, // 3D 
convet on video + { pick_batch_size(1, n), 3, 16, 128, 171 }, // 3D convet on video + { pick_batch_size(1, n), 3, 8, 112, 112 }, // 3D convet on video + { pick_batch_size(1, n), 3, 16, 112, 112 } // 3D convet on video + }; + + // clang-format on +} + +template +inline std::set> +get_3d_bn_spatial_inputs(T n = MIOPEN_TEST_DEFAULT_BATCH_SIZE_FACTOR) +{ + // clang-format off + return + { + { pick_batch_size(32, n), 1, 32, 32, 32 }, // 32x32x32 based on VoxNet arch + { pick_batch_size(32, n), 1, 14, 14, 14 }, + { pick_batch_size(32, n), 32, 14, 14, 14 }, + { pick_batch_size(32, n), 32, 12, 12, 12 }, + { pick_batch_size(32, n), 32, 6, 6, 6 }, + { pick_batch_size(256, n), 1, 32, 32, 32 }, // 32x32x32 based on VoxNet arch + { pick_batch_size(256, n), 32, 14, 14, 14 }, + { pick_batch_size(256, n), 32, 12, 12, 12 }, + { pick_batch_size(256, n), 32, 6, 6, 6 }, + { pick_batch_size(512, n), 1, 32, 32, 32 }, // 32x32x32 based on VoxNet arch + { pick_batch_size(512, n), 32, 14, 14, 14 }, + { pick_batch_size(512, n), 32, 12, 12, 12 }, + { pick_batch_size(512, n), 32, 6, 6, 6 }, + { pick_batch_size(32, n), 2, 32, 57, 125 }, // Hand-gesture recognition CVPR 2015 paper High Res Net Path + { pick_batch_size(32, n), 32, 14, 25, 59 }, + { pick_batch_size(32, n), 32, 6, 10, 27 }, + { pick_batch_size(32, n), 32, 4, 6, 11 }, + { pick_batch_size(32, n), 32, 2, 2, 3 }, + { pick_batch_size(32, n), 32, 32, 28, 62 }, // Hand-gesture recognition CVPR 2015 paper Low Res Net Path + { pick_batch_size(32, n), 32, 14, 12, 29 }, + { pick_batch_size(32, n), 32, 6, 4, 12 }, + { pick_batch_size(32, n), 32, 4, 2, 2 }, + { pick_batch_size(16, n), 32, 6, 50, 50 }, // Multi-view 3D convnet + { pick_batch_size(1, n), 3, 8, 240, 320 }, // 3D convet on video + { pick_batch_size(1, n), 3, 16, 240, 320 }, // 3D convet on video + { pick_batch_size(1, n), 3, 8, 128, 171 }, // 3D convet on video + { pick_batch_size(1, n), 3, 16, 128, 171 }, // 3D convet on video + { pick_batch_size(1, n), 3, 8, 112, 112 }, // 3D 
convet on video + { pick_batch_size(1, n), 3, 16, 112, 112 } // 3D convet on video + }; + // clang-format on +} + +template +inline std::vector> get_sub_tensor() +{ + return {{16, 4, 8, 1, 4}, + {2, 4, 8, 8, 4}, + {16, 4, 8, 4}, + {13, 8, 4, 8}, + {3, 8, 7}, + {16, 4, 10}, + {3, 8}, + {16, 4}, + {4}}; +} + +template +inline std::vector> get_tensor_offsets() +{ + static_assert(std::is_signed_v); + return {{0, 0}, {0, 2}, {4, 0}, {5, 7}}; +} + +template +inline std::vector get_tensor_offset() +{ + static_assert(std::is_signed_v); + return {0, 1, 2, 3, 4, 5}; +} + +#endif diff --git a/projects/miopen/miopen_utils/include/miopen_utils/random.hpp b/projects/miopen/miopen_utils/include/miopen_utils/random.hpp new file mode 100644 index 000000000000..63b69ac9875a --- /dev/null +++ b/projects/miopen/miopen_utils/include/miopen_utils/random.hpp @@ -0,0 +1,62 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2021 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + *******************************************************************************/ +#ifndef GUARD_MIOPEN_TEST_RANDOM_HPP +#define GUARD_MIOPEN_TEST_RANDOM_HPP + +#include + +namespace prng { +template +inline T gen_descreet_uniform_sign(double scale, int32_t range) +{ + return static_cast(scale * prng::gen_A_to_B(-range + 1, range)); +} + +template +inline T gen_descreet_unsigned(double scale, int32_t range) +{ + return static_cast(scale * static_cast(gen_0_to_B(range))); +} + +} // namespace prng + +// lambda factory +template +auto uniform_signed_initializer(ScaleT scale_arg, RangeT range_arg) +{ + return [=](auto&&...) -> T { + // uniform sign give balance of both negative and positive values + return prng::gen_descreet_uniform_sign(scale_arg, range_arg); + }; +} + +template +auto uniform_unsigned_initializer(ScaleT scale_arg, RangeT range_arg) +{ + return [=](auto&&...) -> T { return prng::gen_descreet_unsigned(scale_arg, range_arg); }; +} + +#endif // GUARD_MIOPEN_TEST_RANDOM_HPP diff --git a/projects/miopen/miopen_utils/include/miopen_utils/rnn_util.hpp b/projects/miopen/miopen_utils/include/miopen_utils/rnn_util.hpp new file mode 100644 index 000000000000..a6569cebb7e6 --- /dev/null +++ b/projects/miopen/miopen_utils/include/miopen_utils/rnn_util.hpp @@ -0,0 +1,305 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2023 Advanced Micro Devices, Inc. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + *******************************************************************************/ + +#ifndef MIOPEN_RNN_UTIL_H_ +#define MIOPEN_RNN_UTIL_H_ + +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include + +// complexity O(NlogN) +inline std::vector GetReverseOrderIndex(const std::vector& base_index) +{ + std::vector reverse_index(base_index.size()); + unsigned next_rev_index = 0; + for(auto id : base_index) + reverse_index[id] = next_rev_index++; + return reverse_index; +}; + +inline std::vector GetSamplesIndexDescendingOrder(const std::vector& unsorted_seq_lens) +{ + const auto sample_count = unsorted_seq_lens.size(); + + std::vector index_v(sample_count); + std::iota(index_v.begin(), index_v.end(), 0); + + auto seq_len_cmp = [&unsorted_seq_lens](unsigned a_id, unsigned b_id) { + return unsorted_seq_lens[a_id] > unsorted_seq_lens[b_id]; + }; + + std::stable_sort(index_v.begin(), index_v.end(), seq_len_cmp); + + return index_v; +} + +template +inline void HiddenTensorReorder(const std::vector& src_array, + std::vector& dst_array, + const std::vector& batch_order, + const std::vector hid_len, + bool is_dst_direct_order) +{ + const size_t copy_size = hid_len[2]; + + const size_t batch_stride = hid_len[2]; + const size_t layer_stride = batch_stride * hid_len[1]; + + for(size_t batch_id = 0; batch_id < hid_len[1]; batch_id++) + { + const auto src_batch_off = + batch_stride * (is_dst_direct_order ? batch_order[batch_id] : batch_id); + const auto dst_batch_off = + batch_stride * (is_dst_direct_order ? 
batch_id : batch_order[batch_id]); + + for(size_t layer_id = 0; layer_id < hid_len[0]; layer_id++) + { + const auto dst_offset = dst_batch_off + layer_id * layer_stride; + const auto src_offset = src_batch_off + layer_id * layer_stride; + + std::copy(src_array.begin() + src_offset, + src_array.begin() + src_offset + copy_size, + dst_array.begin() + dst_offset); + } + } +} + +inline void createTensorDescArray(std::vector& td, + std::vector& ptd, + const std::vector bs, + const int secondDim, + miopenDataType_t dataType) +{ + + std::transform(bs.begin(), bs.end(), std::back_inserter(td), [&](int x) { + return miopen::TensorDescriptor( + dataType, {static_cast(x), static_cast(secondDim)}); + }); + std::transform(td.begin(), td.end(), std::back_inserter(ptd), [](miopen::TensorDescriptor& x) { + return &x; + }); +} + +inline std::tuple +GetTempPackedBuffersSize(std::vector batchs, int in_vec, int out_vec) +{ + size_t total_batch = std::accumulate(batchs.begin(), batchs.end(), 0ULL); + + size_t in_buff_size = total_batch * in_vec; + size_t out_buff_size = total_batch * out_vec; + return {in_buff_size, out_buff_size}; +} + +inline size_t getSuperTensorSize(const std::vector& bs, + int seqLength, + int inputSize, + int hiddenSize, + int maxPaddingVal, + bool isBidirect, + bool isInput, + bool isPadded) +{ + return (isPadded // + ? static_cast(seqLength) * maxPaddingVal + : std::accumulate(bs.begin(), bs.end(), 0ULL)) // + * (isInput // + ? static_cast(inputSize) + : static_cast(hiddenSize) * (isBidirect ? 
2 : 1)); +} + +template +void ChangeDataPadding(const std::vector& src_array, + std::vector& dst_array, + const std::vector& batch_list, + int max_batch, + int sample_size, + bool is_src_packed) +{ + auto seq_len = batch_list.size(); + + auto scr_ptr = &src_array[0]; + auto dst_ptr = &dst_array[0]; + + for(int seq_id = 0; seq_id < seq_len; seq_id++) + { + auto packed_size = batch_list[seq_id] * sample_size; + + std::copy(scr_ptr, scr_ptr + packed_size, dst_ptr); + + if(is_src_packed) + { + dst_ptr += max_batch * sample_size; + scr_ptr += packed_size; + } + else + { + scr_ptr += max_batch * sample_size; + dst_ptr += packed_size; + } + } +} + +// RNN VANILLA configs +inline std::vector get_rnn_num_layers() { return {{1, 3}}; } + +inline std::vector get_rnn_batchSize() { return {{1, 17}}; } + +inline std::vector get_rnn_seq_len() { return {{1, 3, 51}}; } + +inline std::vector get_rnn_vector_len() { return {31}; } + +inline std::vector get_rnn_hidden_size() { return {127}; } + +// LSTM configs +inline std::vector get_lstm_num_layers() { return {{1, 3}}; } + +inline std::vector get_lstm_batchSize() { return {{1, 17}}; } + +inline std::vector get_lstm_seq_len() { return {{1, 25}}; } + +inline std::vector get_lstm_vector_len() { return {17}; } + +inline std::vector get_lstm_hidden_size() { return {67}; } + +// GRU configs +inline std::vector get_gru_num_layers() { return {{1, 3}}; } + +inline std::vector get_gru_batchSize() { return {{1, 17}}; } + +inline std::vector get_gru_seq_len() { return {{1, 23}}; } + +inline std::vector get_gru_vector_len() { return {13}; } + +inline std::vector get_gru_hidden_size() { return {67}; } + +inline std::vector> generate_batchSeq(const int batchSize, const int seqLength) +{ + + static constexpr int modval = 3; + + int currentval = batchSize; + std::vector batchSeq; + batchSeq.reserve(seqLength); + for(int i = 0; i < seqLength; i++) + { + if(i > 0) + { + int nvalue = currentval - prng::gen_0_to_B(modval); + currentval = (nvalue < 1) ? 
1 : nvalue; + // printf("current value: %d\n", currentval); + } + // printf("adding a value to batch sequence: %d\n", currentval); + batchSeq.push_back(currentval); + } + return {batchSeq}; +} + +inline int sumvc(const std::vector& x) { return std::accumulate(x.begin(), x.end(), 0); } + +template +inline T activfunc(T x, int actvf) +{ + T alpha = static_cast(1), beta0 = static_cast(0), beta1 = static_cast(1); + if(actvf == 0) + { + return (x > 0) ? x : x * beta0; + } + else if(actvf == 2) + { + return static_cast(1 / (1 + std::exp(-x))); + } + return static_cast(alpha * std::tanh(beta1 * x)); +} + +template +inline T dervactivfunc(T x, int actvf) +{ + if(actvf == 0) + { + return static_cast(x > 0 ? 1 : 0); + } + else if(actvf == 2) + { + return static_cast(std::exp(-x) / (1 + std::exp(-x)) / (1 + std::exp(-x))); + } + + return static_cast(1 / std::cosh(x) / std::cosh(x)); +} + +template +void RNN_mm_cpu_batched(const Dtype* a_ptr, + size_t a_cols, + size_t a_rows, + size_t lda, + size_t a_stride, + int a_flags, + const Dtype* b_ptr, + size_t b_cols, + size_t b_rows, + size_t ldb, + size_t b_stride, + int b_flags, + Dtype* c_ptr, + size_t c_cols, + size_t c_rows, + size_t ldc, + size_t c_stride, + int batchCount, + double alpha, + double beta) +{ + for(int i = 0; i < batchCount; ++i) + { + gemm_cpu(a_ptr + a_stride * i, + a_cols, + a_rows, + lda, + a_flags == 1 ? true : false, + b_ptr + b_stride * i, + b_cols, + b_rows, + ldb, + b_flags == 1 ? 
true : false, + c_ptr + c_stride * i, + c_cols, + c_rows, + ldc, + alpha, + beta); + } +} + +#endif diff --git a/projects/miopen/miopen_utils/include/miopen_utils/serialize.hpp b/projects/miopen/miopen_utils/include/miopen_utils/serialize.hpp new file mode 100644 index 000000000000..71d3133df063 --- /dev/null +++ b/projects/miopen/miopen_utils/include/miopen_utils/serialize.hpp @@ -0,0 +1,129 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2018 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + *******************************************************************************/ + +#ifndef MIOPEN_GUARD_TEST_SERIALIZE_HPP +#define MIOPEN_GUARD_TEST_SERIALIZE_HPP + +#include +#include +#include +#include +#include +#include +#include + +template +struct is_trivial_serializable : std::is_trivially_copy_constructible +{ +}; + +template <> +struct is_trivial_serializable : std::true_type +{ +}; + +template +std::enable_if_t{}> serialize(std::ostream& os, const T& x) +{ + os.write(reinterpret_cast(&x), sizeof(T)); +} + +template +auto serialize(std::ostream& os, + const T& x) -> decltype(x.begin(), x.end(), T(x.begin(), x.end()), void()) +{ + std::size_t n = std::distance(x.begin(), x.end()); + serialize(os, n); + for(auto&& y : x) + serialize(os, y); +} + +template +std::enable_if_t>{}> +serialize(std::ostream& os, const std::tuple& t) +{ + miopen::unpack( + [&](auto&&... xs) { miopen::each_args([&](auto&& x) { serialize(os, x); }, xs...); }, t); +} + +template +std::enable_if_t{}> serialize(std::istream& is, T& x) +{ + is.read(reinterpret_cast(&x), sizeof(T)); +} + +template +std::enable_if_t{}> serialize(std::istream& is, std::vector& x) +{ + std::size_t n; + serialize(is, n); + x.resize(n); + is.read(reinterpret_cast(x.data()), sizeof(T) * n); +} + +template +auto serialize(std::istream& is, + T& x) -> decltype(x.begin(), x.end(), x.assign(x.begin(), x.end()), void()) +{ + using value_type = std::decay_t; + std::size_t n; + serialize(is, n); + std::vector v; + v.reserve(n); + for(std::size_t i = 0; i < n; i++) + { + value_type y; + serialize(is, y); + v.push_back(y); + } + x.assign(v.begin(), v.end()); +} + +template +std::enable_if_t>{}> +serialize(std::istream& is, + // cppcheck-suppress constParameter + std::tuple& t) +{ + miopen::unpack( + [&](auto&&... 
xs) { miopen::each_args([&](auto&& x) { serialize(is, x); }, xs...); }, t); +} + +template +void load(std::string name, T& x) +{ + std::ifstream is{name.c_str()}; + serialize(is, x); +} + +template +void save(std::string name, const T& x) +{ + std::ofstream os{name.c_str()}; + serialize(os, x); +} + +#endif diff --git a/projects/miopen/miopen_utils/include/miopen_utils/tensor_holder.hpp b/projects/miopen/miopen_utils/include/miopen_utils/tensor_holder.hpp new file mode 100644 index 000000000000..f762f80f280c --- /dev/null +++ b/projects/miopen/miopen_utils/include/miopen_utils/tensor_holder.hpp @@ -0,0 +1,505 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2017 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + *******************************************************************************/ +#ifndef GUARD_TENSOR_HOLDER_HPP +#define GUARD_TENSOR_HOLDER_HPP + +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include +using half = half_float::half; +using hip_bfloat16 = bfloat16; +#include "../../src/kernels/hip_float8.hpp" +using float8_fnuz = miopen_f8::hip_f8; +using bfloat8_fnuz = miopen_f8::hip_f8; + +#include +#include + +template +void visit_tensor_size(std::size_t n, F f) +{ + switch(n) + { + case 0: { + f(std::integral_constant{}); + break; + } + case 1: { + f(std::integral_constant{}); + break; + } + case 2: { + f(std::integral_constant{}); + break; + } + case 3: { + f(std::integral_constant{}); + break; + } + case 4: { + f(std::integral_constant{}); + break; + } + case 5: { + f(std::integral_constant{}); + break; + } + default: throw std::runtime_error("Unknown tensor size"); + } +} + +template +struct miopen_type; + +template <> +struct miopen_type : std::integral_constant +{ +}; + +template <> +struct miopen_type : std::integral_constant +{ +}; + +template <> +struct miopen_type : std::integral_constant +{ +}; +template <> +struct miopen_type : std::integral_constant +{ +}; + +template <> +struct miopen_type : std::integral_constant +{ +}; + +template <> +struct miopen_type : std::integral_constant +{ +}; + +template <> +struct miopen_type : std::integral_constant +{ +}; + +template <> +struct miopen_type : std::integral_constant +{ +}; + +template <> +struct miopen_type : std::integral_constant +{ +}; + +template <> +struct miopen_type : std::integral_constant +{ +}; + +template <> +struct miopen_type : std::integral_constant +{ +}; + +template <> +struct miopen_type : std::integral_constant +{ +}; + +template +struct tensor +{ + using value_type = T; + miopen::TensorDescriptor desc; + std::vector data; + +#if defined(__clang__) || defined(__GNUG__) +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored 
"-Wdeprecated-declarations" +#endif + + tensor() : desc(miopen_type{}) {} + +#if defined(__clang__) || defined(__GNUG__) +#pragma GCC diagnostic pop +#endif + + template + tensor(const std::vector& dims) : desc(miopen_type{}, dims), data(desc.GetElementSpace()) + { + } + + template + tensor(const std::vector& dims, const std::vector& strides) + : desc(miopen_type{}, dims, strides), data(desc.GetElementSpace()) + { + assert(dims.size() == strides.size()); + } + + template + tensor(miopenTensorLayout_t layout, const std::vector& dims) + : desc(miopen_type{}, layout, dims), data(desc.GetElementSpace()) + { + } + + template + tensor(miopenTensorLayout_t layout, const std::vector& dims, const std::vector& strides) + : desc(miopen_type{}, layout, dims, strides), data(desc.GetElementSpace()) + { + assert(dims.size() == strides.size()); + } + + tensor(std::size_t n, std::size_t c, std::size_t h, std::size_t w) + : desc(miopen_type{}, {n, c, h, w}), data(n * c * h * w) + { + } + + tensor(miopenTensorLayout_t layout, std::size_t n, std::size_t c, std::size_t h, std::size_t w) + : desc(miopen_type{}, layout, {n, c, h, w}), data(desc.GetElementSpace()) + { + } + + tensor(std::size_t n, std::size_t c, std::size_t d, std::size_t h, std::size_t w) + : desc(miopen_type{}, {n, c, d, h, w}), data(n * c * d * h * w) + { + } + + tensor(std::size_t n) : desc(miopen_type{}, {n}), data(n) {} + + tensor(miopen::TensorDescriptor rhs) : desc(std::move(rhs)) + { + assert(desc.GetType() == miopen_type{} + /// In the driver, T is input tensor type, but output tensor holders + /// are instantiatied with T as well. This leads to false assertion + /// failures when T is INT8 because output type is different. 
+ /// \todo Get rid of this hack when the driver is improved: + || (miopen_type{} == miopenInt8 && desc.GetType() == miopenInt32)); + data.resize(desc.GetElementSpace()); + } + + size_t GetDataByteSize() const { return GetSize() * sizeof(T); } + + size_t GetSize() const { return desc.GetElementSpace(); } + + template + tensor& generate(G g) & + { + if(this->desc.GetVectorLength() > 1) + this->generate_vect_impl(g); + else + this->generate_impl(g); + return *this; + } + + template + tensor&& generate(G g) && + { + if(this->desc.GetVectorLength() > 1) + this->generate_vect_impl(g); + else + this->generate_impl(g); + return std::move(*this); + } + + template + void generate_impl(G g) + { + auto seed = std::accumulate(desc.GetLengths().begin(), + desc.GetLengths().end(), + std::size_t{521288629}, + [](auto x, auto y) { + x ^= x << 1U; + return x ^ y; + }); + seed ^= data.size(); + seed ^= desc.GetLengths().size(); + prng::reset_seed(seed); + auto iterator = data.begin(); + auto assign = [&](T x) { + *iterator = x; + ++iterator; + }; + this->for_each( + miopen::compose(miopen::compose(assign, miopen::cast_to()), std::move(g))); + } + + template + void generate_vect_impl(G g) + { + auto seed = std::accumulate(desc.GetLengths().begin(), + desc.GetLengths().end(), + std::size_t{521288629}, + [](auto x, auto y) { + x ^= x << 1U; + return x ^ y; + }); + seed ^= data.size(); + seed ^= desc.GetLengths().size(); + prng::reset_seed(seed); + auto iterator = data.begin(); + auto vectorLength = desc.GetVectorLength(); + auto assign = [&](T x) { + assert(iterator < data.end()); + // for debugging + for(auto i = 0; i < vectorLength; i++) + { + *(iterator + i) = x; + } + iterator += vectorLength; + }; + this->for_each( + miopen::compose(miopen::compose(assign, miopen::cast_to()), std::move(g))); + } + + template + struct for_each_unpacked + { + Loop loop; + F f; + template + auto operator()(Ts... 
xs) const -> decltype(f(xs...), void()) + { + loop(xs...)(std::move(f)); + } + + struct any + { + any() {} + template + any(X) + { + } + }; + + [[noreturn]] void operator()(any = {}, + any = {}, + any = {}, + any = {}, + any = {}, + any = {}, + any = {}, + any = {}, + any = {}) const + { + throw std::runtime_error( + "Arguments to for_each do not match tensor size or the function " + + miopen::get_type_name() + " can not be called."); + } + }; + + struct for_each_handler + { + template + void operator()(Self* self, Loop loop, F f, Size size) const + { + auto dims = miopen::tien(self->desc.GetLengths()); + miopen::unpack(for_each_unpacked{loop, std::move(f)}, dims); + } + }; + + template + void for_each(F f) const + { + visit_tensor_size( + desc.GetLengths().size(), + std::bind(for_each_handler{}, this, miopen::ford, std::move(f), std::placeholders::_1)); + } + + template + void par_for_each(F f) const + { + visit_tensor_size( + desc.GetLengths().size(), + std::bind( + for_each_handler{}, this, miopen::par_ford, std::move(f), std::placeholders::_1)); + } + + template + T& operator()(Ts... xs) + { + assert(this->desc.GetIndex(xs...) < data.size()); + return this->data[this->desc.GetIndex(xs...)]; + } + + template + const T& operator()(Ts... xs) const + { + assert(this->desc.GetIndex(xs...) < data.size()); + return this->data[this->desc.GetIndex(xs...)]; + } + + template + const T& operator()(const std::array& multi_id) const + { + auto f = [&](auto... 
is) { return this->desc.GetIndex(is...); }; + assert(miopen::unpack(f, multi_id) < data.size()); + return this->data[miopen::unpack(f, multi_id)]; + } + + T& operator[](std::size_t i) { return data.at(i); } + + const T& operator[](std::size_t i) const { return data.at(i); } + + typename std::vector::iterator begin() { return data.begin(); } + + typename std::vector::iterator end() { return data.end(); } + + typename std::vector::const_iterator begin() const { return data.begin(); } + + typename std::vector::const_iterator end() const { return data.end(); } + + friend std::ostream& operator<<(std::ostream& stream, const tensor& t) + { + return stream << t.desc; + } + + template + void dump_inner(size_t dim, std::array& coord, Stream& stream) const + { + const auto lengths = this->desc.GetLengths(); + if(lengths.size() == 0) + { + // 0D special case: Just print the one value that we have and return. + stream << (*this)(coord); + } + else if(dim + 1 == lengths.size()) + { + // 1D special case: dump everything on one line + for(size_t i = 0; i < lengths[dim]; ++i) + { + if(i != 0) + stream << ' '; + + coord[dim] = i; + stream << std::setw(4) << (*this)(coord); + } + + stream << '\n'; + } + else + { + if(dim + 2 == lengths.size()) + { + // 2D special case: Also print which 2D slice we are currently printing + // Note: this is not needed for higher dimensions, as they will also pass + // through this branch. 
+ stream << "slice ["; + for(size_t i = 0; i < dim; ++i) + { + stream << coord[i] << ", "; + } + stream << ":, :]\n"; + } + + for(size_t i = 0; i < lengths[dim]; ++i) + { + coord[dim] = i; + this->dump_inner(dim + 1, coord, stream); + } + } + } + + template + void dump(const char* name, Stream& stream = std::cout) const + { + const auto n = this->desc.GetLengths().size(); + stream << "==== " << name << ": " << *this << n << '\n'; + stream.fill(' '); + + const auto flags = stream.flags(); + + visit_tensor_size(n, [&](const auto size) { + constexpr size_t N = decltype(size)::value; + std::array coord; + this->dump_inner(0, coord, stream); + }); + + stream.flags(flags); + } +}; + +template +void serialize(std::istream& s, tensor& x) +{ + std::vector lens; + serialize(s, lens); + std::vector strides; + serialize(s, strides); + x.desc = miopen::TensorDescriptor{miopen_type{}, lens, strides}; + serialize(s, x.data); +} + +template +void serialize(std::ostream& s, const tensor& x) +{ + const auto& lens = x.desc.GetLengths(); + const auto& strides = x.desc.GetStrides(); + serialize(s, lens); + serialize(s, strides); + serialize(s, x.data); +} + +struct tensor_generate +{ + template + Tensor&& operator()(Tensor&& t, G g) const + { + return std::forward(t.generate(g)); + } +}; + +struct tensor_elem_gen_integer +{ + uint64_t max_value = 17; + + template + double operator()(Ts... 
Xs) const + { + static_assert(sizeof...(Ts) < 6, + "Dimensions in tensor_elem_gen_integer must be less than 6."); + assert(max_value > 0); + std::array left = {{Xs...}}; + std::array right = {{613, 547, 701, 877, 1049}}; + uint64_t dot = + std::inner_product(left.begin(), left.end(), right.begin(), static_cast(173)); + return static_cast(dot % max_value); + } +}; + +#endif diff --git a/projects/miopen/miopen_utils/include/miopen_utils/verify.hpp b/projects/miopen/miopen_utils/include/miopen_utils/verify.hpp new file mode 100644 index 000000000000..81af2afbcf2d --- /dev/null +++ b/projects/miopen/miopen_utils/include/miopen_utils/verify.hpp @@ -0,0 +1,245 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2017 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + *******************************************************************************/ +#ifndef GUARD_VERIFY_HPP +#define GUARD_VERIFY_HPP + +#include +#include +#include +#include +#include +#include +#include +#include +using half = half_float::half; +using hip_bfloat16 = bfloat16; +#include +#include + +namespace miopen { + +// Compute the value of a range +template +using range_value = typename std::decay().begin())>::type; + +struct sum_fn +{ + template + auto operator()(T x, U y) const MIOPEN_RETURNS(x + y); +}; +static constexpr sum_fn sum{}; + +struct max_fn +{ + template + static T id(T x) + { + return x; + } + + template + auto operator()(T x, U y) const MIOPEN_RETURNS(max_fn::id(x > y ? x : y)); +}; +static constexpr max_fn max{}; + +namespace abs_diff_detail { +using std::fabs; +struct fn +{ + template + auto operator()(T x, U y) const MIOPEN_RETURNS(fabs(x - y)); +}; + +} // namespace abs_diff_detail + +static constexpr abs_diff_detail::fn abs_diff{}; + +struct not_finite_fn +{ + template ), bool>::type = false> + bool operator()(T x) const + { + return !std::isfinite(x); + } + + template ::type, half_float::half>), + bool>::type = false> + bool operator()(T x) const + { + return !half_float::isfinite(x); + } + + template ::type, bfloat16>), + bool>::type = false> + bool operator()(T x) const + { + return !std::isfinite(x); // bfloat16 has float() conversion operator + } + + template ), bool>::type = false> + bool operator()(T x) const + { + std::ignore = x; + return false; + } +}; +static constexpr not_finite_fn not_finite{}; + +template +T as(T, U x) +{ + return x; +} + +struct compare_mag_fn +{ + template + bool operator()(T x, U y) const + { + using std::fabs; + return fabs(x) < fabs(y); + } +}; +static constexpr compare_mag_fn compare_mag{}; + +struct square_diff_fn +{ + template + double operator()(T x, U y) const + { + double diff = static_cast(x - y); + return diff * diff; + } +}; +static constexpr square_diff_fn square_diff{}; + +template , 
bool> = true> +bool equal_values(T const& lhs, T const& rhs) +{ + return lhs == rhs; +} + +template , bool> = true> +bool equal_values(T const& lhs, T const& rhs) +{ + return miopen::float_equal_sentinel(lhs, rhs); +} + +template +bool range_empty(R1&& r1) +{ + return r1.begin() == r1.end(); +} + +template +auto range_distance(R1&& r1) MIOPEN_RETURNS(std::distance(r1.begin(), r1.end())); + +template +bool range_zero(const std::vector& r) +{ + return std::all_of(r.begin(), r.end(), [](T x) { return equal_values(x, T()); }); +} + +template +bool range_zero(const tensor& r) +{ + return range_zero(r.data); +} + +template +T range_product(R1&& r1, R2&& r2, T state, Reducer r, Product p) +{ + return std::inner_product(r1.begin(), r1.end(), r2.begin(), state, r, p); +} + +template +std::size_t mismatch_idx(R1&& r1, R2&& r2, Compare compare) +{ + auto p = std::mismatch(r1.begin(), r1.end(), r2.begin(), compare); + return std::distance(r1.begin(), p.first); +} + +template +int64_t find_idx(R1&& r1, Predicate p) +{ + auto it = std::find_if(r1.begin(), r1.end(), p); + if(it == r1.end()) + return -1; + else + return std::distance(r1.begin(), it); +} + +template +double max_diff(R1&& r1, R2&& r2) +{ + return range_product(r1, r2, 0.0, max, abs_diff); +} + +template +auto max_diff_v2(R1&& r1, R2&& r2) +{ + using T = decltype(r1[0] - r2[0]); + auto abs_diff_func = [](auto x, auto y) { return x > y ? 
x - y : y - x; }; + // BUG: deduced wrong datatype, half_float bug + if constexpr(std::is_same_v) + return range_product(r1, r2, half_float::half(), max, abs_diff_func); + else + return range_product(r1, r2, T(), max, abs_diff_func); +} + +template +std::size_t mismatch_diff(R1&& r1, R2&& r2, T diff) +{ + return mismatch_idx( + r1, + r2, + std::bind( + float_equal, diff, std::bind(abs_diff, std::placeholders::_1, std::placeholders::_2))); +} + +template +double rms_range(R1&& r1, R2&& r2) +{ + std::size_t n = range_distance(r1); + if(n == range_distance(r2)) + { + if(n == 0) + return 0; + double square_difference = range_product(r1, r2, 0.0, sum_fn{}, square_diff); + double mag1 = static_cast(*std::max_element(r1.begin(), r1.end(), compare_mag)); + double mag2 = static_cast(*std::max_element(r2.begin(), r2.end(), compare_mag)); + double mag = + std::max({std::fabs(mag1), std::fabs(mag2), std::numeric_limits::min()}); + return std::sqrt(square_difference) / (std::sqrt(n) * mag); + } + else + return double(std::numeric_limits>::max()); +} +} // namespace miopen +#endif diff --git a/projects/miopen/test/CMakeLists.txt b/projects/miopen/test/CMakeLists.txt index bef91d0ea871..035f1314fc63 100755 --- a/projects/miopen/test/CMakeLists.txt +++ b/projects/miopen/test/CMakeLists.txt @@ -414,9 +414,9 @@ function(add_test_executable TEST_NAME) endif() # MIOpen_with_plugins ensures CK plugin .so's are built alongside the test if(NOT MIOPEN_EMBED_DB STREQUAL "") - target_link_libraries(${TEST_NAME} MIOpen_with_plugins miopen_data miopen_common_utils) + target_link_libraries(${TEST_NAME} MIOpen_with_plugins miopen_data miopen_common_utils miopen_utils) else() - target_link_libraries(${TEST_NAME} MIOpen_with_plugins miopen_common_utils) + target_link_libraries(${TEST_NAME} MIOpen_with_plugins miopen_common_utils miopen_utils) endif() target_include_directories(${TEST_NAME} PRIVATE ../src/kernels) if(WIN32) diff --git a/projects/miopen/test/cpu_bias.hpp 
b/projects/miopen/test/cpu_bias.hpp index 9b0c2578feef..4b150035d5c0 100644 --- a/projects/miopen/test/cpu_bias.hpp +++ b/projects/miopen/test/cpu_bias.hpp @@ -1,141 +1,5 @@ -/******************************************************************************* - * - * MIT License - * - * Copyright (c) 2019 Advanced Micro Devices, Inc. - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - * - *******************************************************************************/ +// Forwarding header — implementation moved to miopen_utils. 
#ifndef GUARD_CPU_BIAS_HPP #define GUARD_CPU_BIAS_HPP - -#include "test.hpp" -#include -#include -#include -#include -#include -#include -#include -#include - -#include "tensor_holder.hpp" -#include -#include - -template -void cpu_bias_forward_impl(tensor& out, const tensor& bias) -{ - assert(out.desc.GetNumDims() == NSpatialDim + 2 and bias.desc.GetNumDims() == NSpatialDim + 2); - assert( - bias.desc.GetLengths()[0] == 1 && bias.desc.GetLengths()[1] == out.desc.GetLengths()[1] && - std::all_of(bias.desc.GetLengths().begin() + 2, bias.desc.GetLengths().end(), [](auto v) { - return v == 1; - })); - - out.par_for_each([&](auto out_n_id, auto out_k_id, auto... out_spatial_id_pack) { - out(out_n_id, out_k_id, out_spatial_id_pack...) = - double(out(out_n_id, out_k_id, out_spatial_id_pack...)) + double(bias.data[out_k_id]); - }); -} - -template -void cpu_bias_backward_data_impl(const tensor& out, tensor& bias) -{ - assert(out.desc.GetNumDims() == NSpatialDim + 2 and bias.desc.GetNumDims() == NSpatialDim + 2); - assert( - bias.desc.GetLengths()[0] == 1 && bias.desc.GetLengths()[1] == out.desc.GetLengths()[0] && - std::all_of(bias.desc.GetLengths().begin() + 2, bias.desc.GetLengths().end(), [](auto v) { - return v == 1; - })); - - std::size_t out_n_len = out.desc.GetLengths()[0]; - std::size_t out_k_len = out.desc.GetLengths()[1]; - - std::array out_spatial_len{}; - std::copy_n(out.desc.GetLengths().begin() + 2, NSpatialDim, out_spatial_len.begin()); - - miopen::par_ford(out_k_len)([&](auto out_k_id) { - auto ford_out_n_spatial = - miopen::unpacker(miopen::prepender(miopen::ford, out_n_len))(out_spatial_len); - - double acc = 0; - ford_out_n_spatial([&](auto out_n_id, auto... 
out_spatial_id_pack) { - acc += double(out(out_n_id, out_k_id, out_spatial_id_pack...)); - }); - - bias.data[out_k_id] = acc; - }); -} - -template -void cpu_bias_forward(tensor& out, const tensor& bias) -{ - switch(out.desc.GetNumDims()) - { - case 3: { - cpu_bias_forward_impl<1>(out, bias); - break; - } - case 4: { - cpu_bias_forward_impl<2>(out, bias); - break; - } - case 5: { - cpu_bias_forward_impl<3>(out, bias); - break; - } - case 6: { - cpu_bias_forward_impl<4>(out, bias); - break; - } - default: { - MIOPEN_THROW("not belong to any case"); - } - } -} - -template -void cpu_bias_backward_data(const tensor& out, tensor& bias) -{ - switch(out.desc.GetNumDims()) - { - case 3: { - cpu_bias_backward_data_impl<1>(out, bias); - break; - } - case 4: { - cpu_bias_backward_data_impl<2>(out, bias); - break; - } - case 5: { - cpu_bias_backward_data_impl<3>(out, bias); - break; - } - case 6: { - cpu_bias_backward_data_impl<4>(out, bias); - break; - } - default: { - MIOPEN_THROW("not belong to any case"); - } - } -} +#include #endif diff --git a/projects/miopen/test/cpu_conv.hpp b/projects/miopen/test/cpu_conv.hpp index 895262311b12..fac5227efe75 100644 --- a/projects/miopen/test/cpu_conv.hpp +++ b/projects/miopen/test/cpu_conv.hpp @@ -1,515 +1,5 @@ -/******************************************************************************* - * - * MIT License - * - * Copyright (c) 2019 Advanced Micro Devices, Inc. 
- * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - * - *******************************************************************************/ +// Forwarding header — implementation moved to miopen_utils. #ifndef GUARD_CPU_CONV_HPP #define GUARD_CPU_CONV_HPP - -#include "test.hpp" -#include -#include -#include -#include -#include -#include -#include -#include - -#include "tensor_holder.hpp" -#include -#include -#include - -template -static constexpr auto make_array(T x, Ts... 
xs) -{ - return std::array{{x, xs...}}; -} - -template -struct PassThru -{ - T operator()(T t) { return t; } -}; - -template -struct cpu_convolution_acc_type -{ - using type = double; // default using double as accumulator -}; - -template <> -struct cpu_convolution_acc_type -{ - using type = int32_t; -}; - -template <> -struct cpu_convolution_acc_type -{ - using type = double; -}; - -template -void cpu_convolution_forward_impl(const tensor& in, - const tensor& wei, - tensor& out, - const Range& pads, - const Range& strides, - const Range& dilations, - std::size_t group_count, - FI fi = {}, - FW fw = {}) -{ - static_assert(ConvDim > 0, "wrong! convolution dim should be larger than 0"); - assert(in.desc.GetNumDims() == ConvDim + 2 and wei.desc.GetNumDims() == ConvDim + 2 and - out.desc.GetNumDims() == ConvDim + 2 and pads.size() == ConvDim and - strides.size() == ConvDim and dilations.size() == ConvDim); - std::size_t out_n_len = out.desc.GetLengths()[0]; - - std::size_t wei_k_len = wei.desc.GetLengths()[0]; - std::size_t wei_c_len = wei.desc.GetLengths()[1]; - - std::size_t vector_len = in.desc.GetVectorLength(); - - std::array in_spatial_len{}; - std::array wei_spatial_len{}; - std::array out_spatial_len{}; - - std::copy_n(in.desc.GetLengths().begin() + 2, ConvDim, in_spatial_len.begin()); - std::copy_n(wei.desc.GetLengths().begin() + 2, ConvDim, wei_spatial_len.begin()); - std::copy_n(out.desc.GetLengths().begin() + 2, ConvDim, out_spatial_len.begin()); - - if(wei.desc.GetLayout_str() == "CHWNc") - { - wei_c_len = wei.desc.GetLengths()[0]; - std::copy_n(wei.desc.GetLengths().begin() + 1, ConvDim, wei_spatial_len.begin()); - wei_k_len = wei.desc.GetLengths()[3]; - } - - std::size_t wei_k_len_per_group = wei_k_len / group_count; - - // f(x0, x1, xs...) - // f1(xs...) = f(x0, x1, xs...) - // f2(xs_array) = f1(xs...) 
- auto par_ford_out_nk_spatial = miopen::unpacker( - miopen::prepender(miopen::par_ford, out_n_len, wei_k_len))(out_spatial_len); - - par_ford_out_nk_spatial([&](std::size_t out_n_id, - std::size_t out_k_id, - auto... out_spatial_id_pack) { - auto out_spatial_id = make_array(out_spatial_id_pack...); - - std::size_t group_id = out_k_id / wei_k_len_per_group; - Tacc acc = 0; - - miopen::ford(wei_c_len)([&](std::size_t wei_c_id) { - std::size_t in_c_id = group_id * wei_c_len + wei_c_id; - - auto ford_wei_spatial = miopen::unpacker(miopen::ford)(wei_spatial_len); - - ford_wei_spatial([&](auto... wei_spatial_id_pack) { - auto wei_spatial_id = make_array(wei_spatial_id_pack...); - - std::array in_spatial_id{}; - - for(std::size_t i = 0; i < ConvDim; ++i) - { - in_spatial_id[i] = - out_spatial_id[i] * strides[i] + wei_spatial_id[i] * dilations[i] - pads[i]; - } - bool out_of_bound = false; - for(std::size_t i = 0; i < ConvDim; ++i) - { - out_of_bound = out_of_bound or - (in_spatial_id[i] < 0 or in_spatial_id[i] >= in_spatial_len[i]); - } - if(!out_of_bound) - { - if(vector_len > 1) - { - std::array in_id{}; - in_id[1] = out_n_id; - in_id[2] = in_c_id; - std::copy_n(in_spatial_id.begin(), ConvDim, in_id.begin() + 3); - for(std::size_t i = 0; i < vector_len; i++) - { - in_id[0] = i; - acc += Tacc(in(in_id)) * - Tacc(wei(i, out_k_id, wei_c_id, wei_spatial_id_pack...)); - } - } - else - { - std::array in_id{}; - in_id[0] = out_n_id; - in_id[1] = in_c_id; - std::copy_n(in_spatial_id.begin(), ConvDim, in_id.begin() + 2); - Tacc tmp1 = static_cast(fi(in(in_id))); - Tacc tmp2 = - static_cast(fw(wei(out_k_id, wei_c_id, wei_spatial_id_pack...))); - acc += tmp1 * tmp2; - } - } - }); - }); - if(vector_len > 1) - { - out(out_k_id % vector_len, out_n_id, out_k_id / vector_len, out_spatial_id_pack...) = - static_cast(acc); - } - else - { - out(out_n_id, out_k_id, out_spatial_id_pack...) 
= static_cast(acc); - } - }); -} - -template -void cpu_convolution_backward_data_impl(tensor& in, - const tensor& wei, - const tensor& out, - const Range& pads, - const Range& strides, - const Range& dilations, - std::size_t group_count, - FW fw = {}, - FO fo = {}) -{ - static_assert(ConvDim > 0, "wrong! convolution dim should be larger than 0"); - assert(in.desc.GetNumDims() == ConvDim + 2 and wei.desc.GetNumDims() == ConvDim + 2 and - out.desc.GetNumDims() == ConvDim + 2 and pads.size() == ConvDim and - strides.size() == ConvDim and dilations.size() == ConvDim); - - std::size_t in_n_len = in.desc.GetLengths()[0]; - std::size_t in_c_len = in.desc.GetLengths()[1]; - - std::size_t wei_k_len = wei.desc.GetLengths()[0]; - std::size_t wei_c_len = wei.desc.GetLengths()[1]; - - std::size_t wei_k_len_per_group = wei_k_len / group_count; - - std::array in_spatial_len{}; - std::array wei_spatial_len{}; - std::array out_spatial_len{}; - - std::copy_n(in.desc.GetLengths().begin() + 2, ConvDim, in_spatial_len.begin()); - std::copy_n(wei.desc.GetLengths().begin() + 2, ConvDim, wei_spatial_len.begin()); - std::copy_n(out.desc.GetLengths().begin() + 2, ConvDim, out_spatial_len.begin()); - - auto par_ford_in_nc_spatial = - miopen::unpacker(miopen::prepender(miopen::par_ford, in_n_len, in_c_len))(in_spatial_len); - - par_ford_in_nc_spatial( - [&](std::size_t in_n_id, std::size_t in_c_id, auto... in_spatial_id_pack) { - auto in_spatial_id = make_array(in_spatial_id_pack...); - - std::size_t group_id = in_c_id / wei_c_len; - - Tacc acc = 0; - - miopen::ford(wei_k_len_per_group)([&](std::size_t wei_k_id_inside_group) { - auto ford_wei_spatial = miopen::unpacker(miopen::ford)(wei_spatial_len); - - ford_wei_spatial([&](auto... 
wei_spatial_id_pack) { - auto wei_spatial_id = make_array(wei_spatial_id_pack...); - - std::array out_spatial_id_{}; - std::array out_spatial_id{}; - - for(std::size_t i = 0; i < ConvDim; ++i) - { - out_spatial_id_[i] = - pads[i] + in_spatial_id[i] - wei_spatial_id[i] * dilations[i]; - out_spatial_id[i] = out_spatial_id_[i] / strides[i]; - } - - bool use = true; - for(std::size_t i = 0; i < ConvDim; ++i) - { - use &= out_spatial_id_[i] % strides[i] == 0 and out_spatial_id[i] >= 0 and - out_spatial_id[i] < out_spatial_len[i]; - } - - if(use) - { - std::size_t out_k_id = - group_id * wei_k_len_per_group + wei_k_id_inside_group; - std::size_t wei_c_id = in_c_id % wei_c_len; - - std::array out_id{}; - out_id[0] = in_n_id; - out_id[1] = out_k_id; - std::copy_n(out_spatial_id.begin(), ConvDim, out_id.begin() + 2); - Tacc tmp1 = fo(out(out_id)); - Tacc tmp2 = fw(wei(out_k_id, wei_c_id, wei_spatial_id_pack...)); - acc += tmp1 * tmp2; - } - }); - }); - // TODO: Why do we need a no-lint here ? - in(in_n_id, in_c_id, in_spatial_id_pack...) = static_cast(acc); // NOLINT - }); -} - -template -void cpu_convolution_backward_weight_impl(const tensor& in, - tensor& wei, - const tensor& out, - const Range& pads, - const Range& strides, - const Range& dilations, - std::size_t group_count, - FI fi, - FO fo) -{ - static_assert(ConvDim > 0, "wrong! 
convolution dim should be larger than 0"); - assert(in.desc.GetNumDims() == ConvDim + 2 and wei.desc.GetNumDims() == ConvDim + 2 and - out.desc.GetNumDims() == ConvDim + 2 and pads.size() == ConvDim and - strides.size() == ConvDim and dilations.size() == ConvDim); - - std::size_t out_n_len = out.desc.GetLengths()[0]; - - std::size_t wei_k_len = wei.desc.GetLengths()[0]; - std::size_t wei_c_len = wei.desc.GetLengths()[1]; - - std::size_t wei_k_len_per_group = wei_k_len / group_count; - - std::array in_spatial_len{}; - std::array wei_spatial_len{}; - std::array out_spatial_len{}; - - std::copy_n(in.desc.GetLengths().begin() + 2, ConvDim, in_spatial_len.begin()); - std::copy_n(wei.desc.GetLengths().begin() + 2, ConvDim, wei_spatial_len.begin()); - std::copy_n(out.desc.GetLengths().begin() + 2, ConvDim, out_spatial_len.begin()); - - auto par_ford_wei_kc_spatial = miopen::unpacker( - miopen::prepender(miopen::par_ford, wei_k_len, wei_c_len))(wei_spatial_len); - - par_ford_wei_kc_spatial( - [&](std::size_t wei_k_id, std::size_t wei_c_id, auto... wei_spatial_id_pack) { - auto wei_spatial_id = make_array(wei_spatial_id_pack...); - - std::size_t group_id = wei_k_id / wei_k_len_per_group; - std::size_t in_c_id = group_id * wei_c_len + wei_c_id; - - Tacc acc = 0; - - miopen::ford(out_n_len)([&](std::size_t out_n_id) { - auto ford_out_spatial = miopen::unpacker(miopen::ford)(out_spatial_len); - - ford_out_spatial([&](auto... 
out_spatial_id_pack) { - auto out_spatial_id = make_array(out_spatial_id_pack...); - - std::array in_spatial_id{}; - - for(std::size_t i = 0; i < ConvDim; ++i) - { - in_spatial_id[i] = out_spatial_id[i] * strides[i] + - wei_spatial_id[i] * dilations[i] - pads[i]; - } - - bool out_of_bound = false; - for(std::size_t i = 0; i < ConvDim; ++i) - { - out_of_bound = out_of_bound or (in_spatial_id[i] < 0 or - in_spatial_id[i] >= in_spatial_len[i]); - } - - if(!out_of_bound) - { - std::array in_id{}; - in_id[0] = out_n_id; - in_id[1] = in_c_id; - std::copy_n(in_spatial_id.begin(), ConvDim, in_id.begin() + 2); - Tacc tmp1 = fi(in(in_id)); - Tacc tmp2 = fo(out(out_n_id, wei_k_id, out_spatial_id_pack...)); - acc += tmp1 * tmp2; - } - }); - - wei(wei_k_id, wei_c_id, wei_spatial_id_pack...) = static_cast(acc); - }); - }); -} - -template , - typename FW = PassThru> -void cpu_convolution_forward(std::size_t spatial_dim, - const tensor& in, - const tensor& wei, - tensor& out, - const Range& pads, - const Range& strides, - const Range& dilations, - std::size_t group_count, - FI fi = {}, - FW fw = {}) -{ - switch(spatial_dim) - { - case 1: { - cpu_convolution_forward_impl<1, Tacc>( - in, wei, out, pads, strides, dilations, group_count, fi, fw); - break; - } - case 2: { - cpu_convolution_forward_impl<2, Tacc>( - in, wei, out, pads, strides, dilations, group_count, fi, fw); - break; - } - case 3: { - cpu_convolution_forward_impl<3, Tacc>( - in, wei, out, pads, strides, dilations, group_count, fi, fw); - break; - } - case 4: { - cpu_convolution_forward_impl<4, Tacc>( - in, wei, out, pads, strides, dilations, group_count, fi, fw); - break; - } - default: { - MIOPEN_THROW("not belong to any case"); - } - } -} - -template , - typename FO = PassThru> -void cpu_convolution_backward_data(std::size_t spatial_dim, - tensor& in, - const tensor& wei, - const tensor& out, - const Range& pads, - const Range& strides, - const Range& dilations, - std::size_t group_count, - FW fw = {}, - FO fo = {}) 
-{ - switch(spatial_dim) - { - case 1: { - cpu_convolution_backward_data_impl<1, Tacc>( - in, wei, out, pads, strides, dilations, group_count, fw, fo); - break; - } - case 2: { - cpu_convolution_backward_data_impl<2, Tacc>( - in, wei, out, pads, strides, dilations, group_count, fw, fo); - break; - } - case 3: { - cpu_convolution_backward_data_impl<3, Tacc>( - in, wei, out, pads, strides, dilations, group_count, fw, fo); - break; - } - case 4: { - cpu_convolution_backward_data_impl<4, Tacc>( - in, wei, out, pads, strides, dilations, group_count, fw, fo); - break; - } - default: { - MIOPEN_THROW("not belong to any case"); - } - } -} - -template , - typename FO = PassThru> -void cpu_convolution_backward_weight(std::size_t spatial_dim, - const tensor& in, - tensor& wei, - const tensor& out, - const Range& pads, - const Range& strides, - const Range& dilations, - std::size_t group_count, - FI fi = {}, - FO fo = {}) -{ - switch(spatial_dim) - { - case 1: { - cpu_convolution_backward_weight_impl<1, Tacc>( - in, wei, out, pads, strides, dilations, group_count, fi, fo); - break; - } - case 2: { - cpu_convolution_backward_weight_impl<2, Tacc>( - in, wei, out, pads, strides, dilations, group_count, fi, fo); - break; - } - case 3: { - cpu_convolution_backward_weight_impl<3, Tacc>( - in, wei, out, pads, strides, dilations, group_count, fi, fo); - break; - } - case 4: { - cpu_convolution_backward_weight_impl<4, Tacc>( - in, wei, out, pads, strides, dilations, group_count, fi, fo); - break; - } - default: { - MIOPEN_THROW("not belong to any case"); - } - } -} +#include #endif diff --git a/projects/miopen/test/cpu_layernorm.hpp b/projects/miopen/test/cpu_layernorm.hpp index 8b5bf965deab..9f1c7a55ba42 100644 --- a/projects/miopen/test/cpu_layernorm.hpp +++ b/projects/miopen/test/cpu_layernorm.hpp @@ -1,216 +1,5 @@ -// Copyright © Advanced Micro Devices, Inc., or its affiliates. -// SPDX-License-Identifier: MIT +// Forwarding header — implementation moved to miopen_utils. 
#ifndef GUARD_CPU_CONV_HPP #define GUARD_CPU_CONV_HPP - -#include <../test/tensor_holder.hpp> - -template -void cpu_layernorm_forward(tensor input, - tensor weight, - tensor bias, - tensor& ref_output, - tensor& ref_mean, - tensor& ref_rstd, - float eps, - int32_t dim, - miopenNormMode_t mode, - bool use_multithread = false) -{ - auto layout = input.desc.GetLayoutEnum(); - size_t stride = 1; - if(dim > 1 && layout.has_value() && - (layout.value() == miopenTensorNHWC || layout.value() == miopenTensorNDHWC)) - { - stride = input.desc.GetLengths()[1]; // stride = C - } - - auto dims = input.desc.GetLengths(); - size_t outer_size = 1; - size_t inner_size = 1; - for(size_t i = 0; i < dims.size(); ++i) - { - if(i < dim) - { - if(!(stride > 1 && i == 1)) - { - outer_size *= dims[i]; - } - } - else - { - inner_size *= dims[i]; - } - } - - size_t min_grain = use_multithread ? 8 : outer_size; - miopen::par_for(outer_size, min_grain, [&](int32_t o) { - miopen::ford(stride)([&](int32_t s) { - double mean_v = 0.0; - double var_v = 0.0; - - miopen::ford(inner_size)([&](int32_t i) { - double tmp = static_cast(input[o * inner_size * stride + i * stride + s]); - mean_v += tmp; - var_v += tmp * tmp; - }); - - mean_v = mean_v / inner_size; - var_v = var_v / inner_size - mean_v * mean_v; - double rstd_v = 1.0 / sqrt(var_v + eps); - - ref_mean[o * stride + s] = static_cast(mean_v); - ref_rstd[o * stride + s] = static_cast(rstd_v); - - miopen::ford(inner_size)([&](int32_t i) { - double weight_v = - (mode == MIOPEN_ELEMENTWISE_AFFINE) ? 1.0 : static_cast(weight[i]); - double bias_v = - (mode == MIOPEN_ELEMENTWISE_AFFINE) ? 
0.0 : static_cast(bias[i]); - - ref_output[o * inner_size * stride + i * stride + s] = static_cast( - (static_cast(input[o * inner_size * stride + i * stride + s]) - - mean_v) * - rstd_v * weight_v + - bias_v); - }); - }); - }); -} - -template -void cpu_layernorm_backward(tensor dy, - tensor x, - tensor weight, - tensor mean, - tensor rstd, - tensor& ref_dx, - int32_t dim, - miopenNormMode_t mode, - bool use_multithread = false) -{ - auto layout = dy.desc.GetLayoutEnum(); - size_t stride = 1; - if(dim > 1 && (layout == miopenTensorNHWC || layout == miopenTensorNDHWC)) - { - stride = dy.desc.GetLengths()[1]; // stride = C - } - - auto dims = dy.desc.GetLengths(); - size_t outer_size = 1; - size_t inner_size = 1; - for(size_t i = 0; i < dims.size(); ++i) - { - if(i < dim) - { - if(!(stride > 1 && i == 1)) - { - outer_size *= dims[i]; - } - } - else - { - inner_size *= dims[i]; - } - } - - size_t min_grain = use_multithread ? 8 : outer_size; - miopen::par_for(outer_size, min_grain, [&](int32_t o) { - miopen::ford(stride)([&](int32_t s) { - double sum_dy_weight = 0.0; - double sum_dy_weight_x = 0.0; - - miopen::ford(inner_size)([&](int32_t i) { - double pweight = - (mode == MIOPEN_ELEMENTWISE_AFFINE) ? 1.0 : static_cast(weight[i]); - double pdy = (dy.GetSize() != 0) - ? static_cast(dy[o * inner_size * stride + i * stride + s]) - : 0.0; - double px = static_cast(x[o * inner_size * stride + i * stride + s]); - - sum_dy_weight += pdy * pweight; - sum_dy_weight_x += pdy * px * pweight; - }); - - double scale = 1.0 / static_cast(inner_size); - double prstd = static_cast(rstd[o * stride + s]); - double pmean = static_cast(mean[o * stride + s]); - double a = prstd * prstd * prstd * scale * (sum_dy_weight_x - sum_dy_weight * pmean); - double b = prstd * sum_dy_weight * scale - a * pmean; - - miopen::ford(inner_size)([&](int32_t i) { - double pweight = - (mode == MIOPEN_ELEMENTWISE_AFFINE) ? 1.0 : static_cast(weight[i]); - double pdy = (dy.GetSize() != 0) - ? 
static_cast(dy[o * inner_size * stride + i * stride + s]) - : 0.0; - double val = prstd * pdy * pweight - - a * static_cast(x[o * inner_size * stride + i * stride + s]) - - b; - - ref_dx[o * inner_size * stride + i * stride + s] = static_cast(val); - }); - }); - }); -} - -template -void cpu_layernorm_backward_weight_bias(tensor dy, - tensor x, - tensor mean, - tensor rstd, - tensor& ref_dw, - tensor& ref_db, - int32_t dim, - bool use_multithread = false) -{ - auto layout = dy.desc.GetLayoutEnum(); - size_t stride = 1; - if(dim > 1 && (layout == miopenTensorNHWC || layout == miopenTensorNDHWC)) - { - stride = dy.desc.GetLengths()[1]; // stride = C - } - - auto dims = dy.desc.GetLengths(); - size_t outer_size = 1; - size_t inner_size = 1; - for(size_t i = 0; i < dims.size(); ++i) - { - if(i < dim) - { - if(!(stride > 1 && i == 1)) - { - outer_size *= dims[i]; - } - } - else - { - inner_size *= dims[i]; - } - } - - size_t min_grain = use_multithread ? 8 : inner_size; - miopen::par_for(inner_size, min_grain, [&](int32_t i) { - double sum_dw = 0.0; - double sum_db = 0.0; - - miopen::ford(stride)([&](int32_t s) { - miopen::ford(outer_size)([&](int32_t o) { - double prstd = static_cast(rstd[o * stride + s]); - double pmean = static_cast(mean[o * stride + s]); - double pdy = (dy.GetSize() != 0) - ? 
static_cast(dy[o * inner_size * stride + i * stride + s]) - : 0; - double px = static_cast(x[o * inner_size * stride + i * stride + s]); - - sum_dw += pdy * (px - pmean) * prstd; - sum_db += pdy; - }); - }); - - ref_dw[i] = sum_dw; - ref_db[i] = sum_db; - }); -} - +#include #endif diff --git a/projects/miopen/test/cpu_reduce_util.hpp b/projects/miopen/test/cpu_reduce_util.hpp index 88728b02faec..73de3b18e2e1 100644 --- a/projects/miopen/test/cpu_reduce_util.hpp +++ b/projects/miopen/test/cpu_reduce_util.hpp @@ -1,649 +1,5 @@ -/******************************************************************************* - * - * MIT License - * - * Copyright (c) 2020 Advanced Micro Devices, Inc. - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - * - *******************************************************************************/ +// Forwarding header — implementation moved to miopen_utils. 
#ifndef GUARD_CPU_REDUCE_UTIL_HPP #define GUARD_CPU_REDUCE_UTIL_HPP - -#include "miopen/reducetensor.hpp" -#include "tensor_holder.hpp" -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -namespace reduce { - -template -static inline bool float_equal_one(T); - -static inline bool float_equal_one(float x) { return x == 1.0f; }; - -static inline bool float_equal_one(double x) { return x == 1.0; }; - -static inline bool float_equal_one(half_float::half x) -{ - return x == convert_type(1.0f); -}; - -template -static inline bool float_equal_zero(T x); - -static inline bool float_equal_zero(float x) { return x == 0.0f; }; - -static inline bool float_equal_zero(double x) { return x == 0.0; }; - -static inline bool float_equal_zero(half_float::half x) -{ - return x == convert_type(0.0f); -}; - -template -static inline void build_radix(const std::vector& lens, std::vector& radix) -{ - const std::size_t D = lens.size(); - radix.assign(D, 1); - for(std::size_t d = D; d-- > 1;) - radix[d - 1] = radix[d] * static_cast(lens[d]); // radix[d] = Π_{k>d} lens[k] -} - -// i -> memory offset using lens-radix + actual strides -template -static inline std::size_t linear_to_offset_by_lens_strides(std::size_t i, - const std::vector& lens, - const std::vector& radix, - const std::vector& strides) -{ - std::size_t off = 0; - for(std::size_t d = 0; d < lens.size(); ++d) - { - const std::size_t idx_d = (i / radix[d]) % static_cast(lens[d]); - off += idx_d * static_cast(strides[d]); - } - return off; -} - -template -static inline std::function PreUnaryOpFn(miopenReduceTensorOp_t op_, std::size_t) -{ - using std::abs; - - switch(op_) - { - case MIOPEN_REDUCE_TENSOR_NORM1: return ([&](compType& a_) { a_ = abs(a_); }); - case MIOPEN_REDUCE_TENSOR_NORM2: return ([&](compType& a_) { a_ = a_ * a_; }); - case MIOPEN_REDUCE_TENSOR_AMAX: return ([&](compType& a_) { a_ = abs(a_); }); - - case MIOPEN_REDUCE_TENSOR_AVG: - case MIOPEN_REDUCE_TENSOR_ADD: - 
case MIOPEN_REDUCE_TENSOR_MUL: - case MIOPEN_REDUCE_TENSOR_MIN: - case MIOPEN_REDUCE_TENSOR_MAX: return ([&](compType&) {}); - } - - throw std::runtime_error(std::string(__FUNCTION__) + - ": using undefined Reduction operation is not permitted"); -}; - -template -static inline std::function PosUnaryOpFn(miopenReduceTensorOp_t op_, - std::size_t divider) -{ - using std::sqrt; - - switch(op_) - { - case MIOPEN_REDUCE_TENSOR_NORM2: return ([&](compType& a_) { a_ = sqrt(a_); }); - - case MIOPEN_REDUCE_TENSOR_AVG: - return ([&, divider](compType& a_) { - a_ = a_ / convert_type(static_cast(divider)); - }); - - case MIOPEN_REDUCE_TENSOR_ADD: - case MIOPEN_REDUCE_TENSOR_NORM1: - case MIOPEN_REDUCE_TENSOR_MUL: - case MIOPEN_REDUCE_TENSOR_MIN: - case MIOPEN_REDUCE_TENSOR_MAX: - case MIOPEN_REDUCE_TENSOR_AMAX: return ([&](compType&) {}); - } - - throw std::runtime_error(std::string(__FUNCTION__) + - ": using undefined Reduction operation is not permitted"); -}; - -template -static inline std::function ReduceOpFn(miopenReduceTensorOp_t op_) -{ - switch(op_) - { - case MIOPEN_REDUCE_TENSOR_ADD: - case MIOPEN_REDUCE_TENSOR_AVG: - case MIOPEN_REDUCE_TENSOR_NORM1: - case MIOPEN_REDUCE_TENSOR_NORM2: return ([&](compType& a_, compType b_) { a_ = a_ + b_; }); - - case MIOPEN_REDUCE_TENSOR_MUL: return ([&](compType& a_, compType b_) { a_ = a_ * b_; }); - - case MIOPEN_REDUCE_TENSOR_MIN: - return ([&](compType& a_, compType b_) { - if(a_ > b_) - a_ = b_; - }); - - case MIOPEN_REDUCE_TENSOR_MAX: - case MIOPEN_REDUCE_TENSOR_AMAX: - return ([&](compType& a_, compType b_) { - if(a_ < b_) - a_ = b_; - }); - } - - throw std::runtime_error(std::string(__FUNCTION__) + - ": using undefined Reduction operation is not permitted"); -}; - -template -static inline std::function -ReduceOpFn2(miopenReduceTensorOp_t op_) -{ - switch(op_) - { - case MIOPEN_REDUCE_TENSOR_MIN: - return ([&](compType& a_, compType b_, bool& changed) { - if(a_ > b_) - { - a_ = b_; - changed = true; - } - else - { - changed 
= false; - } - }); - - case MIOPEN_REDUCE_TENSOR_MAX: - case MIOPEN_REDUCE_TENSOR_AMAX: - return ([&](compType& a_, compType b_, bool& changed) { - if(a_ < b_) - { - a_ = b_; - changed = true; - } - else - { - changed = false; - } - }); - - case MIOPEN_REDUCE_TENSOR_ADD: - case MIOPEN_REDUCE_TENSOR_MUL: - case MIOPEN_REDUCE_TENSOR_AVG: - case MIOPEN_REDUCE_TENSOR_NORM1: - case MIOPEN_REDUCE_TENSOR_NORM2: return (std::function{}); - }; - - throw std::runtime_error(std::string(__FUNCTION__) + - ": using undefined Reduction operation is not permitted"); -}; - -template -static inline compType ReduceOpZeroVal(miopenReduceTensorOp_t op_) -{ - switch(op_) - { - case MIOPEN_REDUCE_TENSOR_ADD: - case MIOPEN_REDUCE_TENSOR_AVG: - case MIOPEN_REDUCE_TENSOR_NORM1: - case MIOPEN_REDUCE_TENSOR_NORM2: return (convert_type(0.0f)); - - case MIOPEN_REDUCE_TENSOR_MUL: return (convert_type(1.0f)); - - case MIOPEN_REDUCE_TENSOR_MIN: return (std::numeric_limits::max()); - - case MIOPEN_REDUCE_TENSOR_MAX: return (std::numeric_limits::lowest()); - case MIOPEN_REDUCE_TENSOR_AMAX: return (convert_type(0.0f)); - } - - throw std::runtime_error(std::string(__FUNCTION__) + - ": using undefined Reduction operation is not permitted"); -}; - -template -static inline void binop_with_nan_check(miopenNanPropagation_t nanOpt, - reduceOpT&& opReduce, - compType& accuVal, - compType currVal) -{ - using std::isnan; - - if(nanOpt == MIOPEN_NOT_PROPAGATE_NAN) - { - opReduce(accuVal, currVal); - } - else - { - if(isnan(currVal)) - accuVal = currVal; - else - opReduce(accuVal, currVal); - }; -}; - -template -static inline void binop_with_nan_check2(miopenNanPropagation_t nanOpt, - reduceOpT&& opReduce, - compType& accuVal, - compType currVal, - int& accuIndex, - int currIndex) -{ - using std::isnan; - - if(nanOpt == MIOPEN_NOT_PROPAGATE_NAN) - { - bool changed; - - opReduce(accuVal, currVal, changed); - - if(changed) - accuIndex = currIndex; - } - else - { - if(isnan(currVal)) - { - accuVal = currVal; - 
accuIndex = currIndex; - } - else - { - bool changed; - - opReduce(accuVal, currVal, changed); - - if(changed) - accuIndex = currIndex; - }; - }; -}; - -}; // end of namespace reduce - -template -std::vector> get_all_indexes(const std::vector& lens) -{ - const std::size_t D = lens.size(); - assert(D > 0); - - std::size_t N = 1; - for(const auto L : lens) - N *= static_cast(L); - - std::vector> out; - out.resize(N); - for(auto& row : out) - row.resize(D); - - std::vector stride(D, 1); - for(std::size_t d = D; d-- > 1;) - stride[d - 1] = stride[d] * static_cast(lens[d]); - - for(std::size_t r = 0; r < N; ++r) - { - for(std::size_t d = 0; d < D; ++d) - out[r][d] = static_cast((r / stride[d]) % static_cast(lens[d])); - } - - return out; -} - -template -static inline T -linear_to_offset(size_t li, const std::vector& lens, const std::vector& strides) -{ - T off = 0; - for(int d = int(lens.size()) - 1; d >= 0; --d) - { - const T idx = li % lens[d]; - li /= lens[d]; - off += idx * strides[d]; - } - return off; -} - -template -T get_offset_from_index(const std::vector& strides, const std::vector& index) -{ - T offset = 0; - - assert(strides.size() == index.size()); - - for(int i = 0; i < index.size(); i++) - offset += strides[i] * index[i]; - - return (offset); -}; - -template -T get_flatten_offset(const std::vector& lengths, const std::vector& index) -{ - T offset = 0; - - assert(lengths.size() == index.size() && !lengths.empty()); - - int len = lengths.size(); - T stride = 1; - - // for len==1, the loop is not executed - for(int i = len - 1; i > 0; i--) - { - offset += stride * index[i]; - - stride *= lengths[i]; - }; - - offset += stride * index[0]; - - return (offset); -}; - -template -struct Reducer -{ - compType acc; - bool withIdx; - int idx; // meaningful only when WithIdx==true - miopenNanPropagation_t nanOpt; - // functors for reduction - decltype(reduce::ReduceOpFn(MIOPEN_REDUCE_TENSOR_ADD)) opNoIdx; - decltype(reduce::ReduceOpFn2(MIOPEN_REDUCE_TENSOR_ADD)) 
opWithIdx; - - Reducer(miopenNanPropagation_t n, miopenReduceTensorOp_t rop, compType zero, bool useIdx) - : acc(zero), - withIdx(useIdx), - idx(0), - nanOpt(n), - opNoIdx(reduce::ReduceOpFn(rop)), - opWithIdx(reduce::ReduceOpFn2(rop)) - { - } - - inline void step(compType v, int flat_i) - { - if(withIdx) - reduce::binop_with_nan_check2(nanOpt, opWithIdx, acc, v, idx, flat_i); - else - reduce::binop_with_nan_check(nanOpt, opNoIdx, acc, v); - } - - inline void combine(const Reducer& other) - { - if(withIdx) - reduce::binop_with_nan_check2(nanOpt, opWithIdx, acc, other.acc, idx, other.idx); - else - reduce::binop_with_nan_check(nanOpt, opNoIdx, acc, other.acc); - } -}; - -template -std::tuple, tensor> reduce_cpu_common(const miopenReduceTensorOp_t& reduceOp, - const miopenNanPropagation_t& nanOpt, - const std::vector& inLengths, - const std::vector& outLengths, - const std::vector& input, - const std::vector& inStrides, - const std::vector& output, - const std::vector& outStrides, - float alpha, - float beta, - bool parallel, - bool withIdx) -{ - using reduce::convert_type; - using reduce::ReduceOpZeroVal; - - // Partition dims - std::vector invariantDims, toReduceDims; - std::vector invLens, redLens, invStrides_v, redStrides_v; - - for(int i = 0; i < static_cast(inLengths.size()); ++i) - { - if(inLengths[i] == outLengths[i]) - { - invariantDims.push_back(i); - invLens.push_back(inLengths[i]); - invStrides_v.push_back(inStrides[i]); - } - else - { - toReduceDims.push_back(i); - redLens.push_back(inLengths[i]); - redStrides_v.push_back(inStrides[i]); - } - } - - const bool reduceAllDims = invariantDims.empty(); - - // unary ops & zero vals - const compType zeroV = ReduceOpZeroVal(reduceOp); - - // divider = Π reduced dims (or N if reduce-all) - std::size_t divider = 1; - if(reduceAllDims) - divider = std::accumulate( - inLengths.begin(), inLengths.end(), std::size_t{1}, std::multiplies<>()); - else - divider = - std::accumulate(redLens.begin(), redLens.end(), 
std::size_t{1}, std::multiplies<>()); - - auto PreUnaryOp = reduce::PreUnaryOpFn(reduceOp, divider); - auto PosUnaryOp = reduce::PosUnaryOpFn(reduceOp, divider); - - // outputs - auto res = tensor{outLengths}; - res.data = output; - auto res_indices = tensor{outLengths}; - if(withIdx) - std::fill(res_indices.begin(), res_indices.end(), 0); - - if(reduceAllDims) - { - // Flatten whole tensor - const std::size_t N = divider; // product of all dims - std::vector lens_radix; - reduce::build_radix(inLengths, lens_radix); - - // parallel chunking - std::size_t hw = - std::max(std::size_t{1}, static_cast(std::thread::hardware_concurrency())); - const std::size_t P = std::min(N, hw * 4ul); - const std::size_t chunk = (N + P - 1) / P; - - std::vector> partial; - partial.reserve(P); - for(std::size_t p = 0; p < P; ++p) - partial.emplace_back(nanOpt, reduceOp, zeroV, withIdx); - - auto worker = [&](int p) { - const std::size_t begin = std::size_t(p) * chunk; - const std::size_t end = std::min(begin + chunk, N); - - auto& r = partial[p]; - for(std::size_t i = begin; i < end; ++i) - { - const auto off = - reduce::linear_to_offset_by_lens_strides(i, inLengths, lens_radix, inStrides); - auto v = convert_type(input[off]); - PreUnaryOp(v); - r.step(v, static_cast(i)); // flat index across whole tensor - } - }; - - if(parallel) - { - miopen::par_for(static_cast(P), worker); - } - else - { - for(int p = 0; p < P; ++p) - { - worker(p); - } - } - - // combine - Reducer R(nanOpt, reduceOp, zeroV, withIdx); - for(std::size_t p = 0; p < P; ++p) - R.combine(partial[p]); - - // post - PosUnaryOp(R.acc); - if(alpha != 1.0f) - R.acc *= convert_type(alpha); - if(beta != 0.0f) - R.acc += convert_type(output[0]) * convert_type(beta); - - res.data[0] = convert_type(R.acc); - if(withIdx) - res_indices.data[0] = R.idx; - } - else - { - // Build radices for invariant and reduced subspaces - std::vector invRad, redRad; - reduce::build_radix(invLens, invRad); - reduce::build_radix(redLens, redRad); - 
- const std::size_t INV = - std::accumulate(invLens.begin(), invLens.end(), std::size_t{1}, std::multiplies<>()); - const std::size_t TR = divider; - - std::size_t hw = - std::max(std::size_t{1}, static_cast(std::thread::hardware_concurrency())); - const std::size_t Te = std::min(hw * 4ul, std::max(1, INV)); - const std::size_t chunk = (INV + Te - 1) / Te; - - auto worker = [&](int t) { - const std::size_t row0 = std::size_t(t) * chunk; - const std::size_t row1 = std::min(row0 + chunk, INV); - - for(std::size_t r = row0; r < row1; ++r) - { - // decode invariant multi-index; compute base offsets - std::size_t tmp = r; - std::size_t base_in_off = 0; - std::size_t base_out_off = 0; - for(std::size_t k = 0; k < invLens.size(); ++k) - { - const std::size_t idx = (tmp / invRad[k]) % invLens[k]; - base_in_off += idx * invStrides_v[k]; - base_out_off += idx * outStrides[invariantDims[k]]; - } - - Reducer R(nanOpt, reduceOp, zeroV, withIdx); - - // iterate reduced subspace - for(std::size_t i = 0; i < TR; ++i) - { - std::size_t tmp2 = i; - std::size_t red_off = 0; - for(std::size_t k = 0; k < redLens.size(); ++k) - { - const std::size_t idx = (tmp2 / redRad[k]) % redLens[k]; - red_off += idx * redStrides_v[k]; - } - - auto v = convert_type(input[base_in_off + red_off]); - PreUnaryOp(v); - R.step(v, static_cast(i)); // flat index inside reduced subspace - } - - PosUnaryOp(R.acc); - if(alpha != 1.0f) - R.acc *= convert_type(alpha); - if(beta != 0.0f) - R.acc += - convert_type(output[base_out_off]) * convert_type(beta); - - res.data[base_out_off] = convert_type(R.acc); - if(withIdx) - res_indices.data[base_out_off] = R.idx; - } - }; - - if(parallel) - { - miopen::par_for(static_cast(Te), worker); - } - else - { - for(int te = 0; te < Te; ++te) - { - worker(te); - } - } - } - - return {res, res_indices}; -} - -template -std::tuple, tensor> -reduce_cpu_common(const miopen::ReduceTensorDescriptor& reduceDesc, - const tensor& input, - const tensor& output, - float alpha, - float 
beta, - bool parallel, - bool withIdx) -{ - auto inLengths = input.desc.GetLengths(); - auto outLengths = output.desc.GetLengths(); - auto inStrides = input.desc.GetStrides(); - auto outStrides = output.desc.GetStrides(); - - const auto reduceOp = reduceDesc.reduceTensorOp_; - const auto nanOpt = reduceDesc.reduceTensorNanOpt_; - - return reduce_cpu_common(reduceOp, - nanOpt, - inLengths, - outLengths, - input.data, - inStrides, - output.data, - outStrides, - alpha, - beta, - parallel, - withIdx); -} - +#include #endif diff --git a/projects/miopen/test/fusionHost.hpp b/projects/miopen/test/fusionHost.hpp index 9693295959d7..a13ee5601cd4 100644 --- a/projects/miopen/test/fusionHost.hpp +++ b/projects/miopen/test/fusionHost.hpp @@ -1,994 +1,3 @@ -/******************************************************************************* - * - * MIT License - * - * Copyright (c) 2018 Advanced Micro Devices, Inc. - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - * - *******************************************************************************/ +// Forwarding header — implementation moved to miopen_utils. #pragma once -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include "get_handle.hpp" -#include "tensor_holder.hpp" -#include "verify.hpp" - -template -void convHostForward(const tensor& input, - tensor& output, - const tensor& weights, - const int bias_mode, - const tensor& bias, - const miopenConvolutionDescriptor_t convDesc) -{ - - int in_n, in_c, in_h, in_w; - int in_nstride, in_cstride, in_hstride, in_wstride; - std::tie(in_n, in_c, in_h, in_w) = miopen::tien<4>(input.desc.GetLengths()); - std::tie(in_nstride, in_cstride, in_hstride, in_wstride) = - miopen::tien<4>(input.desc.GetStrides()); - - int wei_n, wei_c, wei_h, wei_w; - int wei_nstride, wei_cstride, wei_hstride, wei_wstride; - std::tie(wei_n, wei_c, wei_h, wei_w) = miopen::tien<4>(weights.desc.GetLengths()); - std::tie(wei_nstride, wei_cstride, wei_hstride, wei_wstride) = - miopen::tien<4>(weights.desc.GetStrides()); - - int out_n, out_c, out_h, out_w; - int out_nstride, out_cstride, out_hstride, out_wstride; - std::tie(out_n, out_c, out_h, out_w) = miopen::tien<4>(output.desc.GetLengths()); - std::tie(out_nstride, out_cstride, out_hstride, out_wstride) = - miopen::tien<4>(output.desc.GetStrides()); - - int stride_h, stride_w, pad_h, pad_w, dilation_h, dilation_w; - miopenConvolutionMode_t mode; - miopenPaddingMode_t pmode = miopen::deref(convDesc).paddingMode; - miopenGetConvolutionDescriptor( - convDesc, &mode, &pad_h, &pad_w, &stride_h, &stride_w, &dilation_h, &dilation_w); - - if(pmode == miopenPaddingSame) - 
{ - pad_h = (in_h % stride_h == 0) ? (std::max((wei_h - stride_h), 0)) - : (std::max((wei_h - (in_h % stride_h)), 0)); - pad_w = (in_w % stride_w == 0) ? (std::max((wei_w - stride_w), 0)) - : (std::max((wei_w - (in_w % stride_w)), 0)); - pad_h /= 2; - pad_w /= 2; - } - else if(pmode == miopenPaddingValid) - { - pad_h = 0; - pad_w = 0; - } - - if(out_h <= 0 || out_w <= 0) - MIOPEN_THROW("Invalid Test Case: Check Output Dimension."); - - for(int o = 0; o < out_n; o++) - { // mini-batch size - for(int w = 0; w < out_c; w++) - { // out_channels (num filters) - for(int i = 0; i < out_h; i++) - { // output_height (from getforwardoutputdim()) - int in_off_h = i * stride_h; - for(int j = 0; j < out_w; j++) - { // output_width (from getforwardoutputdim()) - /*auto acc = static_cast(0.);*/ - auto acc = static_cast(0.); - int in_off_w = j * stride_w; - for(int k = 0; k < in_c; k++) - { // in_channels (RGB) - for(int x = 0; x < wei_h; x++) - { - int in_x = in_off_h - pad_h + x * dilation_h; - if(in_x >= 0 && in_x < in_h) - { - for(int y = 0; y < wei_w; y++) - { - int in_y = in_off_w - pad_w + y * dilation_w; - if(in_y >= 0 && in_y < in_w) - { - acc += double( - static_cast(input[o * in_nstride + k * in_cstride + - in_x * in_w + in_y]) * - static_cast(weights(w, k, x, y))); - } - } - } - } - } - acc = bias_mode != 0 ? 
acc + static_cast(bias[w]) : acc; - output[o * out_nstride + w * out_cstride + i * out_hstride + j] = - static_cast(acc); - } - } - } - } -} - -template -void batchNormSpatialHostInference(const tensor& input, - tensor& output, - const tensor& scale, - const tensor& bias, - double epsilon, - const tensor& estimatedMean, - const tensor& estimatedVariance, - bool useInverseVariance = false) -{ - - int n_batches, channels, height, width; - std::tie(n_batches, channels, height, width) = miopen::tien<4>(input.desc.GetLengths()); - miopen::par_for(channels, 1, [&](int cidx) { // via channel - V mean = estimatedMean(0, cidx, 0, 0); - V variance = estimatedVariance(0, cidx, 0, 0); - double invertVar = - useInverseVariance ? static_cast(variance) : 1.0 / sqrt(variance + epsilon); - // process the batch per channel - for(int row = 0; row < height; row++) - { // via rows - for(int column = 0; column < width; column++) - { // via columns - for(int bidx = 0; bidx < n_batches; bidx++) - { // via mini_batch - double elemStd = static_cast(input(bidx, cidx, row, column)) - mean; - double inhat = elemStd * invertVar; - output(bidx, cidx, row, column) = - static_cast(scale(0, cidx, 0, 0) * inhat + bias(0, cidx, 0, 0)); - // printf("output: %f\n",scale(0, cidx, 0, 0) * inhat + bias(0, cidx, 0, 0)); - } - } - } - }); -} - -template -void batchNormPerActivHostInference(const tensor& input, - tensor& output, - const tensor& scale, - const tensor& bias, - double epsilon, - const tensor& estimatedMean, - const tensor& estimatedVariance, - bool useInverseVariance = false) -{ - int n_batches, channels, height, width; - std::tie(n_batches, channels, height, width) = miopen::tien<4>(input.desc.GetLengths()); - miopen::par_for(channels, 1, [&](int cidx) { // via channel - for(int row = 0; row < height; row++) - { // via rows - for(int column = 0; column < width; column++) - { // via columns - // apply down the n_batch dimension - double mean = estimatedMean(0, cidx, row, column); - double 
variance = estimatedVariance(0, cidx, row, column); - double elemInvVar = useInverseVariance ? variance : 1.0 / sqrt(variance + epsilon); - for(int bidx = 0; bidx < n_batches; bidx++) - { // via mini_batch - // per (x-dims) channel load a block of data into LDS - double elemStd = input(bidx, cidx, row, column) - mean; - double inhat = elemStd * elemInvVar; - output(bidx, cidx, row, column) = - scale(0, cidx, row, column) * inhat + bias(0, cidx, row, column); - // printf("output: %f\n",output(bidx, cidx, row, column)); - } - } - } - }); -} - -template -void batchNormSpatialHostFwdTrain(const tensor& input, - tensor& out, - const tensor& scale, - const tensor& bias, - double epsilon, - double expAvgFactor, - tensor& saveMean, - tensor& saveInvVar, - tensor& runMean, - tensor& runVar) -{ - - int height, width, n_batch, channels; - std::tie(n_batch, channels, height, width) = miopen::tien<4>(input.desc.GetLengths()); - const auto nhw = double(height * width * n_batch); - - miopen::par_for(channels, 1, [&](int cidx) { - double elemStd = 0.; - double variance_accum = 0.; - double mean_accum = 0.; - double invVar = 0.; - double newRunMean = 0.; - double adjust = 0.; - - // process the batch per channel - for(int bidx = 0; bidx < n_batch; bidx++) - { // via mini_batch - for(int row = 0; row < height; row++) - { // via rows - for(int column = 0; column < width; column++) - { // via columns - // #1 calculate the mean - // iterating through the stack of images in the mini_batch - auto inval = static_cast(input(bidx, cidx, row, column)); - mean_accum += inval; - variance_accum += inval * inval; - } // end for (column) - } // end for (row) - } // end for (n) - - mean_accum /= nhw; - variance_accum /= nhw; - variance_accum += (-mean_accum * mean_accum); - invVar = 1.0 / sqrt(variance_accum + epsilon); - - // #4 apply the normalization - // x_hat = (x_i - mean) / sqrt(variance_accum + epsilon) - for(int bidx = 0; bidx < n_batch; bidx++) - { // via mini_batch - for(int row = 0; 
row < height; row++) - { // via rows - for(int column = 0; column < width; column++) - { // via columns - // #5 Gamma and Beta adjust - // y_i = gamma*x_hat + beta - elemStd = (static_cast(input(bidx, cidx, row, column)) - - mean_accum); // (x_i - mean) - out(bidx, cidx, row, column) = static_cast( - scale(0, cidx, 0, 0) * (invVar * elemStd) + bias(0, cidx, 0, 0)); - } // for (column) - } // for (row) - } // end for(n_batchs) - if(!saveMean.data.empty()) - { - saveMean(0, cidx, 0, 0) = mean_accum; - saveInvVar(0, cidx, 0, 0) = invVar; - } - if(!runMean.data.empty()) - { - newRunMean = runMean(0, cidx, 0, 0) * (1 - expAvgFactor); - runMean(0, cidx, 0, 0) = mean_accum * expAvgFactor + newRunMean; // newMean*factor + tmp - // var(n+1) = p * var(n-1) + (1 - p)*(b/b-1)*var(n) - adjust = (n_batch * height * width == 1) ? variance_accum - : (nhw / (nhw - 1)) * variance_accum; - runVar(0, cidx, 0, 0) = - (1 - expAvgFactor) * runVar(0, cidx, 0, 0) + expAvgFactor * adjust; - } - }); -} - -template -void batchNormSpatialHostBwdTrain(const tensor& x_input, - tensor& dy_input, - tensor& dx_out, - const tensor& bnScale, - const tensor& bnBias, - tensor& dscale, - tensor& dbias, - const tensor& savedMean, - const tensor& savedInvVar, - miopenActivationMode_t activ_mode, - double activ_beta, - double activ_alpha) -{ - double activ_gamma = 0.; - int height, width, n_batch, channels; - std::tie(n_batch, channels, height, width) = miopen::tien<4>(x_input.desc.GetLengths()); - auto nhw = double(height * width * n_batch); - int in_cstride = height * width; - - if(activ_mode > 0) - { - tensor input_norm = - tensor{x_input.desc.GetLayout_t(), x_input.desc.GetLengths()}; - miopen::par_for(channels, 1, [&](int cidx) { - double mean = 0.0; - double invVar = 0.0; - double elemStd = 0.; - double mean_accum = 0.0; - double variance_accum = 0.0; - if(!savedMean.data.empty()) - { - mean = static_cast(savedMean(0, cidx, 0, 0)); // HxW elements - invVar = static_cast(savedInvVar(0, cidx, 0, 0)); 
// HxW elements - } - else - { - for(int row = 0; row < height; row++) - { // via rows - for(int column = 0; column < width; column++) - { // via columns - for(int bidx = 0; bidx < n_batch; bidx++) - { // via mini_batch - auto inval = static_cast(x_input(bidx, cidx, row, column)); - mean_accum += inval; - variance_accum += inval * inval; - } - } - } - mean_accum /= nhw; - variance_accum /= nhw; - variance_accum += (-mean_accum * mean_accum); - mean = mean_accum; - invVar = 1.0 / sqrt(variance_accum); - } - for(int row = 0; row < height; row++) - { // via rows - for(int column = 0; column < width; column++) - { // via columns - for(int bidx = 0; bidx < n_batch; bidx++) - { // via mini_batch - elemStd = static_cast(x_input(bidx, cidx, row, column)) - - mean; // (x_i - mean) - input_norm(bidx, cidx, row, column) = static_cast( - bnScale(0, cidx, 0, 0) * (elemStd * invVar) + bnBias(0, cidx, 0, 0)); - } - } - } - }); - - activationHostBnormBwd(activ_mode, - activ_gamma, - activ_beta, - activ_alpha, - dy_input.data, - input_norm.data, - dy_input.data); - } - - miopen::par_for(channels, 1, [&](int cidx) { - double elemStd = 0.; - unsigned int xhat_index; - double mean = 0.0; - double invVar = 0.0; - double dyelem = 0.; - std::vector xhat(static_cast(n_batch) * in_cstride, 0.0); - // process the batch per channel - dscale(0, cidx, 0, 0) = 0.; - dbias(0, cidx, 0, 0) = 0.; - - if(!savedMean.data.empty()) - { - - mean = savedMean(0, cidx, 0, 0); // HxW elements - invVar = savedInvVar(0, cidx, 0, 0); // HxW elements - } - else - { - double variance_accum = 0.; - double mean_accum = 0.; - double inv_Var = 0.; - - // process the batch per channel - for(int bidx = 0; bidx < n_batch; bidx++) - { // via mini_batch - for(int row = 0; row < height; row++) - { // via rows - for(int column = 0; column < width; column++) - { // via columns - // #1 calculate the mean - // iterating through the stack of images in the mini_batch - auto inval = static_cast(x_input(bidx, cidx, row, column)); 
- mean_accum += inval; - variance_accum += inval * inval; - } // end for (column) - } // end for (row) - } // end for (n) - - mean_accum /= nhw; - variance_accum /= nhw; - variance_accum += (-mean_accum * mean_accum); - inv_Var = 1.0 / sqrt(variance_accum); - - mean = mean_accum; - invVar = inv_Var; - } - for(int row = 0; row < height; row++) - { // via rows - for(int column = 0; column < width; column++) - { // via columns - for(int bidx = 0; bidx < n_batch; bidx++) - { // via mini_batch - xhat_index = in_cstride * bidx + (width * row + column); - // per (x-dims) channel load a block of data into LDS - elemStd = static_cast(x_input(bidx, cidx, row, column)) - - mean; // (x_i - mean) - xhat[xhat_index] = elemStd * invVar; - dyelem = static_cast(dy_input(bidx, cidx, row, column)); - dbias(0, cidx, 0, 0) += dyelem; - dscale(0, cidx, 0, 0) += xhat[xhat_index] * dyelem; - } // end for(n_batch) - } // for (column) - } // for (row) - - for(int row = 0; row < height; row++) - { // via rows - for(int column = 0; column < width; column++) - { // via columns - for(int bidx = 0; bidx < n_batch; bidx++) - { // via mini_batch - xhat_index = in_cstride * bidx + (width * row + column); - - double tmp1 = nhw * dy_input(bidx, cidx, row, column) - dbias(0, cidx, 0, 0); - double tmp2 = -xhat[xhat_index] * dscale(0, cidx, 0, 0); - double tmp3 = (bnScale(0, cidx, 0, 0) * invVar) / nhw; - dx_out(bidx, cidx, row, column) = - static_cast(tmp3 * (tmp2 + tmp1)); - } // end for(n_batchs) - } // for (column) - } // for (row) - }); // for (channel) -} - -template -void batchNormActivSpatialHostBwdTrain(miopenActivationMode_t activMode, - double gamma, - double beta, - double alpha, - const tensor& x_input, - const tensor& dy_input, - const tensor& y_input, - tensor& dx_out, - const tensor& bnScale, - const tensor& bias, - tensor& dscale, - tensor& dbias, - const tensor& savedMean, - const tensor& savedInvVar) -{ - - int height, width, n_batch, channels; - std::tie(n_batch, channels, height, 
width) = miopen::tien<4>(x_input.desc.GetLengths()); - auto nhw = double(height * width * n_batch); - int in_cstride = height * width; - - miopen::par_for(channels, 1, [&](int cidx) { - double elemStd = 0.; - unsigned int xhat_index; - double mean = static_cast(savedMean(0, cidx, 0, 0)); // HxW elements - double invVar = static_cast(savedInvVar(0, cidx, 0, 0)); // HxW elements - double dyelem = 0.; - std::vector xhat(static_cast(n_batch) * in_cstride, 0.0); - // process the batch per channel - dscale(0, cidx, 0, 0) = 0.; - dbias(0, cidx, 0, 0) = 0.; - - for(int row = 0; row < height; row++) - { // via rows - for(int column = 0; column < width; column++) - { // via columns - for(int bidx = 0; bidx < n_batch; bidx++) - { // via mini_batch - - // recompute forward batch norm - xhat_index = in_cstride * bidx + (width * row + column); - // per (x-dims) channel load a block of data into LDS - elemStd = static_cast(x_input(bidx, cidx, row, column)) - - mean; // (x_i - mean) - xhat[xhat_index] = elemStd * invVar; - double bnrefowd = - bnScale(0, cidx, 0, 0) * xhat[xhat_index] + bias(0, cidx, 0, 0); - activationHostBwdElement(activMode, - gamma, - beta, - alpha, - dy_input(bidx, cidx, row, column), - bnrefowd, - y_input(bidx, cidx, row, column), - dyelem); - dbias(0, cidx, 0, 0) += dyelem; - dscale(0, cidx, 0, 0) += xhat[xhat_index] * dyelem; - } // end for(n_batch) - } // for (column) - } // for (row) - - for(int row = 0; row < height; row++) - { // via rows - for(int column = 0; column < width; column++) - { // via columns - for(int bidx = 0; bidx < n_batch; bidx++) - { // via mini_batch - xhat_index = in_cstride * bidx + (width * row + column); - double bnrefowd = - bnScale(0, cidx, 0, 0) * xhat[xhat_index] + bias(0, cidx, 0, 0); - activationHostBwdElement(activMode, - gamma, - beta, - alpha, - dy_input(bidx, cidx, row, column), - bnrefowd, - y_input(bidx, cidx, row, column), - dyelem); - // double tmp1 = nhw * dy_input(bidx, cidx, row, column) - dbias(0, cidx, 0, 0); - 
double tmp1 = nhw * dyelem - dbias(0, cidx, 0, 0); - double tmp2 = -xhat[xhat_index] * dscale(0, cidx, 0, 0); - double tmp3 = (bnScale(0, cidx, 0, 0) * invVar) / nhw; - dx_out(bidx, cidx, row, column) = static_cast(tmp3 * (tmp2 + tmp1)); - } // end for(n_batchs) - } // for (column) - } // for (row) - }); // for (channel) -} - -template -void batchNormPerActHostFwdTrain(const tensor& input, - tensor& out, - const tensor& scale, - const tensor& bias, - double epsilon, - double expAvgFactor, - tensor& saveMean, - tensor& saveInvVar, - tensor& runMean, - tensor& runVar) -{ - - int height, width, n_batch, channels; - std::tie(n_batch, channels, height, width) = miopen::tien<4>(input.desc.GetLengths()); - const auto n = double(n_batch); - - miopen::par_for(channels, 1, [&](int cidx) { - double mean_accum = 0.; - double variance_accum = 0.; - double elemStd = 0.; - double elemInvVar = 0.; - double inhat = 0.; - double newRunMean = 0.; - double adjust = 0.; - - // process the batch per channel - for(int row = 0; row < height; row++) - { // via rows - for(int column = 0; column < width; column++) - { // via columns - - mean_accum = 0.; - variance_accum = 0.; - for(int bidx = 0; bidx < n_batch; bidx++) - { // via mini_batch - // #1 calculate the mean :: iterating through the stack of images in the - // mini_batch - auto intval = static_cast(input(bidx, cidx, row, column)); - mean_accum += intval; - variance_accum += intval * intval; - } - mean_accum /= n; - variance_accum /= n; - variance_accum = variance_accum - (mean_accum * mean_accum); - elemInvVar = 1.0 / double(sqrt(variance_accum + epsilon)); - - // #4 apply the normalization :: x_hat = (x_i - mean) / sqrt(variance_accum - - // epsilon) - for(int bidx = 0; bidx < n_batch; bidx++) - { // via mini_batch - elemStd = (input(bidx, cidx, row, column) - mean_accum); // (x_i - mean) - inhat = elemStd * elemInvVar; - // #5 Gamma and Beta adjust :: y_i = gamma*x_hat + beta - out(bidx, cidx, row, column) = static_cast( - 
scale(0, cidx, row, column) * inhat + bias(0, cidx, row, column)); - } // end for(n_batch) - - if(!runMean.data.empty()) - { - newRunMean = runMean(0, cidx, row, column) * (1.0 - expAvgFactor); - runMean(0, cidx, row, column) = - mean_accum * expAvgFactor + newRunMean; // newMean*factor + tmp - } - // var(n+1) = p * var(n-1) + (1 - p)*(b/b-1)*var(n) - if(!runVar.data.empty()) - { - adjust = (n_batch == 1) ? variance_accum : (n / (n - 1.0)) * variance_accum; - runVar(0, cidx, row, column) = - (1 - expAvgFactor) * runVar(0, cidx, row, column) + expAvgFactor * adjust; - } - if(!saveMean.data.empty() || !saveInvVar.data.empty()) - { - saveMean(0, cidx, row, column) = static_cast(mean_accum); - saveInvVar(0, cidx, row, column) = static_cast(elemInvVar); - } - - } // for (column) - } // for (row) - }); -} - -template -void batchNormPerActHostBwdTrain(const tensor& x_input, - const tensor& dy_input, - tensor& dx_out, - const tensor& scale, - tensor& dscale, - tensor& dbias, - const tensor& savedMean, - const tensor& savedInvVar) -{ - - int height, width, n_batch, channels; - std::tie(n_batch, channels, height, width) = miopen::tien<4>(x_input.desc.GetLengths()); - int in_cstride = height * width; - auto n = double(n_batch); - - miopen::par_for(channels, 1, [&](int cidx) { - double elemStd = 0.; - unsigned int xhat_index; - double mean = 0.; - double elemInvVar = 0.; - double dyelem = 0.; - double dxhat = 0.; - double dxhathat = 0.; - double tmp1 = 0.; - std::vector xhat(static_cast(n_batch) * in_cstride); - - // process the batch per channel - for(int row = 0; row < height; row++) - { // via rows - for(int column = 0; column < width; column++) - { // via columns - dxhat = 0.; - dxhathat = 0.; - - if(!savedMean.data.empty()) - { - mean = savedMean(0, cidx, row, column); // HxW elements - elemInvVar = savedInvVar(0, cidx, row, column); // HxW elements - } - else - { - double variance_accum = 0.; - double mean_accum = 0.; - - // process the batch per channel - for(int bidx = 
0; bidx < n_batch; bidx++) - { // via mini_batch - auto inval = static_cast(x_input(bidx, cidx, row, column)); - mean_accum += inval; - variance_accum += inval * inval; - } // end for (n) - - mean_accum /= n; - variance_accum /= n; - variance_accum += (-mean_accum * mean_accum); - - mean = mean_accum; - elemInvVar = 1.0 / sqrt(variance_accum); - } - - for(int bidx = 0; bidx < n_batch; bidx++) - { // via mini_batch - xhat_index = in_cstride * bidx + (width * row + column); - // per (x-dims) channel load a block of data into LDS - elemStd = static_cast(x_input(bidx, cidx, row, column)) - - mean; // (x_i - mean) - xhat[xhat_index] = elemStd * elemInvVar; - dyelem = static_cast(dy_input(bidx, cidx, row, column)); - dbias(0, cidx, row, column) += dyelem; - dscale(0, cidx, row, column) += xhat[xhat_index] * dyelem; - tmp1 = scale(0, cidx, row, column) * dyelem; - dxhat += tmp1; - dxhathat += tmp1 * xhat[xhat_index]; - - } // end for(n_batchs) - - for(int bidx = 0; bidx < n_batch; bidx++) - { // via mini_batch - xhat_index = in_cstride * bidx + (width * row + column); - tmp1 = xhat[xhat_index] * dxhathat + dxhat; - double tmp2 = - n_batch * scale(0, cidx, row, column) * dy_input(bidx, cidx, row, column) - - tmp1; - double tmp3 = elemInvVar / (double(n)); - dx_out(bidx, cidx, row, column) = static_cast(tmp3 * tmp2); - } // end for(n_batchs) - } // for (column) - } // for (row) - }); -} - -template -void batchNormActivPerActHostBwdTrain(miopenActivationMode_t activMode, - double gamma, - double beta, - double alpha, - const tensor& x_input, - const tensor& dy_input, - const tensor& y_input, - tensor& dx_out, - const tensor& scale, - const tensor& bias, - tensor& dscale, - tensor& dbias, - const tensor& savedMean, - const tensor& savedInvVar) -{ - - int height, width, n_batch, channels; - std::tie(n_batch, channels, height, width) = miopen::tien<4>(x_input.desc.GetLengths()); - int in_cstride = height * width; - auto n = double(n_batch); - - miopen::par_for(channels, 1, 
[&](int cidx) { - double elemStd = 0.; - unsigned int xhat_index; - double mean = 0.; - double elemInvVar = 0.; - double dyelem = 0.; - double dxhat = 0.; - double dxhathat = 0.; - double tmp1 = 0.; - std::vector xhat(static_cast(n_batch) * in_cstride); - - // process the batch per channel - for(int row = 0; row < height; row++) - { // via rows - for(int column = 0; column < width; column++) - { // via columns - dxhat = 0.; - dxhathat = 0.; - - mean = savedMean(0, cidx, row, column); // HxW elements - elemInvVar = savedInvVar(0, cidx, row, column); // HxW elements - - for(int bidx = 0; bidx < n_batch; bidx++) - { // via mini_batch - xhat_index = in_cstride * bidx + (width * row + column); - // per (x-dims) channel load a block of data into LDS - elemStd = static_cast(x_input(bidx, cidx, row, column)) - - mean; // (x_i - mean) - xhat[xhat_index] = elemStd * elemInvVar; - double bnrefowd = - scale(0, cidx, row, column) * xhat[xhat_index] + bias(0, cidx, row, column); - activationHostBwdElement(activMode, - gamma, - beta, - alpha, - dy_input(bidx, cidx, row, column), - bnrefowd, - y_input(bidx, cidx, row, column), - dyelem); - /*dyelem = static_cast(dy_input(bidx, cidx, row, column));*/ - dbias(0, cidx, row, column) += dyelem; - dscale(0, cidx, row, column) += xhat[xhat_index] * dyelem; - tmp1 = scale(0, cidx, row, column) * dyelem; - dxhat += tmp1; - dxhathat += tmp1 * xhat[xhat_index]; - - } // end for(n_batchs) - - for(int bidx = 0; bidx < n_batch; bidx++) - { // via mini_batch - xhat_index = in_cstride * bidx + (width * row + column); - tmp1 = xhat[xhat_index] * dxhathat + dxhat; - double bnrefowd = - scale(0, cidx, row, column) * xhat[xhat_index] + bias(0, cidx, row, column); - activationHostBwdElement(activMode, - gamma, - beta, - alpha, - dy_input(bidx, cidx, row, column), - bnrefowd, - y_input(bidx, cidx, row, column), - dyelem); - double tmp2 = (n_batch * scale(0, cidx, row, column) * dyelem) - tmp1; - double tmp3 = elemInvVar / (double(n)); - dx_out(bidx, 
cidx, row, column) = static_cast(tmp3 * tmp2); - } // end for(n_batchs) - } // for (column) - } // for (row) - }); -} - -template -void visitActivationHostInfer( - miopenActivationMode_t activMode, double gamma, double beta, double alpha, F f) -{ - switch(activMode) - { - case miopenActivationPASTHRU: // x - f([=](double x) { return x; }); - break; - case miopenActivationLOGISTIC: // 1 / (1 + e^-x) //Sigmoid - f([=](double x) { return (1. / (1. + std::exp(-x))); }); - break; - case miopenActivationTANH: // beta * tanh(alpha * x) - f([=](double x) { return (beta * std::tanh(alpha * x)); }); - break; - case miopenActivationRELU: // max(0, x) - f([=](double x) { return ((x > 0.) ? x : 0.); }); - break; - case miopenActivationSOFTRELU: // log(1 + e^x) // bonomial normal log likelihood - f([=](double x) { - return (x > 0.) ? (x + std::log1p(std::exp(-x))) : (std::log1p(std::exp(x))); - }); - break; - case miopenActivationABS: // abs(x) - f([=](double x) { return (std::fabs(x)); }); - break; - case miopenActivationPOWER: // (alpha + beta * x) ^ gamma - f([=](double x) { - auto v = (alpha + beta * x); - return (v <= std::numeric_limits::epsilon()) ? 0. : pow(v, gamma); - }); - break; - case miopenActivationCLIPPEDRELU: // min(alpha, max(0, x)) - f([=](double x) { return (std::min(alpha, std::max(double(0.), x))); }); - break; - case miopenActivationLEAKYRELU: // alpha * x | x<=0; x | x>0 - f([=](double x) { return ((x > 0.) ? x : x * alpha); }); - break; - case miopenActivationELU: // alpha * (exp(x)-1) | x<=0; x | x>0 - f([=](double x) { return ((x > 0.) ? 
x : alpha * std::expm1(x)); }); - break; - case miopenActivationCLAMP: // max(alpha, min(beta, x)) - f([=](double x) { return (std::max(alpha, std::min(beta, x))); }); - break; - // default: printf("ERROR: unknown neuron type: %d\n", activMode); break; - } -} - -template -inline void activationHostInfer(miopenActivationMode_t activMode, - double gamma, - double beta, - double alpha, - const std::vector input, - std::vector& output) -{ - visitActivationHostInfer(activMode, gamma, beta, alpha, [&](auto f) { - miopen::par_for(input.size(), 1, [&](int index) { - output[index] = static_cast(f(static_cast(input[index]))); - }); - }); -} - -template -void visitActivationHostBwd( - miopenActivationMode_t activMode, double gamma, double beta, double alpha, F f) -{ - switch(activMode) - { - case miopenActivationPASTHRU: // x - f([=](double dy, double, double) { return dy; }); - break; - case miopenActivationLOGISTIC: // 1 / (1 + e^-x) //Sigmoid - f([=](double dy, double, double y) { return dy * y * (1 - y); }); - break; - case miopenActivationTANH: // beta * tanh(alpha * x) - f([=](double dy, double, double y) { return dy * alpha * (beta - y * y / beta); }); - break; - case miopenActivationRELU: // max(0, x) - f([=](double dy, double x, double) { return (x > 0) ? dy : 0; }); - break; - case miopenActivationSOFTRELU: // log(1 + e^x) // bonomial normal log likelihood - f([=](double dy, double x, double) { - static const double threshold = 50.; - double expval = std::exp(std::min(x, threshold)); - return dy * expval / (expval + 1.0); - }); - break; - case miopenActivationABS: // abs(x) - f([=](double dy, double x, double) { return dy * ((x > 0) ? 1 : -1); }); - break; - case miopenActivationPOWER: // (alpha + beta * x) ^ gamma - f([=](double, double x, double y) { - auto v = alpha + beta * x; - return v <= std::numeric_limits::epsilon() ? 
0 : gamma * beta * y / v; - }); - break; - case miopenActivationCLIPPEDRELU: // min(alpha, max(0, x)) - f([=](double dy, double x, double) { return (x > 0 && x <= alpha) ? dy : 0; }); - break; - case miopenActivationLEAKYRELU: // alpha * x | x<=0; x | x>0 - f([=](double dy, double x, double) { return dy * ((x > 0) ? 1 : alpha); }); - break; - case miopenActivationELU: // alpah * (exp(x)-1) | x<=0; x | x>0 - f([=](double dy, double x, double y) { return dy * ((x > 0) ? 1 : y + alpha); }); - break; - case miopenActivationCLAMP: // max(alpha, min(beta, x)) - f([=](double dy, double x, double) { return (x > alpha && x <= beta) ? dy : 0; }); - break; - // default: printf("ERROR: unknown neuron type: %d\n", activMode); break; - } -} - -template -inline void activationHostBnormBwd(miopenActivationMode_t activMode, - double gamma, - double beta, - double alpha, - const std::vector dyinput, - const std::vector xinput, - std::vector& output) -{ - double dummy; - visitActivationHostBwd(activMode, gamma, beta, alpha, [&](auto f) { - miopen::par_for(dyinput.size(), 1, [&](int index) { - output[index] = static_cast( - f(static_cast(dyinput[index]), static_cast(xinput[index]), dummy)); - }); - }); -} - -template -inline void activationHostBwd(miopenActivationMode_t activMode, - double gamma, - double beta, - double alpha, - const std::vector dyinput, - const std::vector xinput, - const std::vector yinput, - std::vector& output) -{ - visitActivationHostBwd(activMode, gamma, beta, alpha, [&](auto f) { - miopen::par_for(dyinput.size(), 1, [&](int index) { - output[index] = static_cast(f(static_cast(dyinput[index]), - static_cast(xinput[index]), - static_cast(yinput[index]))); - }); - }); -} - -inline void activationHostBwdElement(miopenActivationMode_t activMode, - double gamma, - double beta, - double alpha, - const double dyinput, - const double xinput, - const double yinput, - double& output) -{ - visitActivationHostBwd(activMode, gamma, beta, alpha, [&](auto f) { - output = 
static_cast(f(dyinput, xinput, yinput)); - }); -} - -template -tensor get_output_tensor(const miopen::ConvolutionDescriptor& filter, - const tensor& input, - const tensor& weights) -{ - return tensor{filter.GetForwardOutputTensor(input.desc, weights.desc, miopen_type{})}; -} +#include diff --git a/projects/miopen/test/gemm.hpp b/projects/miopen/test/gemm.hpp index 81c38db0fdf3..34fa7db11bec 100644 --- a/projects/miopen/test/gemm.hpp +++ b/projects/miopen/test/gemm.hpp @@ -1,120 +1,5 @@ -/******************************************************************************* - * - * MIT License - * - * Copyright (c) 2017 Advanced Micro Devices, Inc. - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - * - *******************************************************************************/ +// Forwarding header — implementation moved to miopen_utils. 
#ifndef GUARD_GEMM_HPP #define GUARD_GEMM_HPP - -#include -#include -#include - -/* - A and B rows and cols should be passed as default values (NxM, MxK), independently of - a_transponse/b_transpose flag value - C rows and cols should have correct values based on a_transponse/b_transpose values - A, B, C strides should have corret values based on a_transponse/b_transpose values -*/ -template -void gemm_cpu(const Dtype* a_ptr, - const size_t a_cols, - const size_t a_rows, - const size_t a_stride, - const bool a_transpose, - const Dtype* b_ptr, - const size_t b_cols, - const size_t b_rows, - const size_t b_stride, - const bool b_transpose, - Dtype* c_ptr, - const size_t c_cols, - const size_t c_rows, - const size_t c_stride, - double alpha = 1.0, - double beta = 1.0) -{ - if((!a_transpose && !b_transpose && - ((a_cols != b_rows) || (a_rows != c_rows) || (b_cols != c_cols))) || - (a_transpose && b_transpose && - ((a_rows != b_cols) || (a_cols != c_rows) || (b_rows != c_cols))) || - (a_transpose && !b_transpose && - ((a_rows != b_rows) || (a_cols != c_rows) || (b_cols != c_cols))) || - (!a_transpose && b_transpose && - ((a_cols != b_cols) || (a_rows != c_rows) || (b_rows != c_cols)))) - { - MIOPEN_THROW("MM_CPU_ERROR. Incompatible matrix size:\nA: " + std::to_string(a_rows) + "x" + - std::to_string(a_cols) + " transpose: " + (a_transpose ? "true" : "false") + - "\nB: " + std::to_string(b_rows) + "x" + std::to_string(b_cols) + - " transpose: " + (b_transpose ? "true" : "false") + - "\nC: " + std::to_string(c_rows) + "x" + std::to_string(c_cols) + "\n"); - } - - size_t inner_loop_limit = a_transpose ? 
a_rows : a_cols; - auto inner_loop = [&](int m, int n) { - double el = 0.0; - if(!a_transpose && !b_transpose) - { - miopen::ford(inner_loop_limit)([&](int k) { - el += static_cast(a_ptr[m * a_stride + k]) * - static_cast(b_ptr[k * b_stride + n]); - }); - } - else if(!a_transpose && b_transpose) - { - miopen::ford(inner_loop_limit)([&](int k) { - el += static_cast(a_ptr[m * a_stride + k]) * - static_cast(b_ptr[n * b_stride + k]); - }); - } - else if(a_transpose && !b_transpose) - { - miopen::ford(inner_loop_limit)([&](int k) { - el += static_cast(a_ptr[k * a_stride + m]) * - static_cast(b_ptr[k * b_stride + n]); - }); - } - else - { - miopen::ford(inner_loop_limit)([&](int k) { - el += static_cast(a_ptr[k * a_stride + m]) * - static_cast(b_ptr[n * b_stride + k]); - }); - } - - c_ptr[m * c_stride + n] = - static_cast(beta * static_cast(c_ptr[m * c_stride + n]) + alpha * el); - }; - - constexpr size_t iter_margin = 1'048'576; // 2^20 - if(c_rows * c_cols * inner_loop_limit > iter_margin) - { - miopen::par_ford(c_rows, c_cols)(inner_loop); - } - else - { - miopen::ford(c_rows, c_cols)(inner_loop); - } -} - +#include #endif diff --git a/projects/miopen/test/network_data.hpp b/projects/miopen/test/network_data.hpp index 987d4dda9929..7a0dbcd702dd 100644 --- a/projects/miopen/test/network_data.hpp +++ b/projects/miopen/test/network_data.hpp @@ -1,438 +1,5 @@ -/******************************************************************************* - * - * MIT License - * - * Copyright (c) 2017 Advanced Micro Devices, Inc. 
- * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - * - *******************************************************************************/ - +// Forwarding header — implementation moved to miopen_utils. #ifndef GUARD_MIOPEN_TEST_NETWORK_DATA_HPP #define GUARD_MIOPEN_TEST_NETWORK_DATA_HPP - -#include -#include -#include -#include - -#ifndef MIOPEN_TEST_DEFAULT_BATCH_SIZE_FACTOR -#define MIOPEN_TEST_DEFAULT_BATCH_SIZE_FACTOR 0 -#endif - -template -inline constexpr T pick_batch_size(T x, T y) -{ - return (y == 0 || y > x) ? 
1 : x / y; -} - -// Reduce tests execution time -#define MIOPEN_TESTS_GET_INPUTS_ENABLE_HUGE_TENSORS 1 - -template -inline std::set> get_inputs(T n = MIOPEN_TEST_DEFAULT_BATCH_SIZE_FACTOR) -{ - // clang-format off - return - { - { pick_batch_size(32, n), 1, 14, 14 }, - { pick_batch_size(100, n), 1, 8, 8 }, - { pick_batch_size(256, n), 1, 27, 27 }, -#if MIOPEN_TESTS_GET_INPUTS_ENABLE_HUGE_TENSORS - { pick_batch_size(64, n), 19, 1024,2048}, -#endif - { pick_batch_size(100, n), 3, 32, 32 }, - { pick_batch_size(100, n), 32, 16, 16 }, - { pick_batch_size(100, n), 32, 8, 8 }, - { pick_batch_size(128, n), 256, 12, 12 }, - { pick_batch_size(128, n), 3, 231, 231 }, - { pick_batch_size(128, n), 512, 12, 12 }, - { pick_batch_size(256, n), 256, 13, 13 }, - { pick_batch_size(256, n), 3, 227, 227 }, - { pick_batch_size(256, n), 384, 13, 13 }, - { pick_batch_size(256, n), 96, 27, 27 }, - { pick_batch_size(32, n), 128, 28, 28 }, - { pick_batch_size(32, n), 144, 14, 14 }, - { pick_batch_size(32, n), 192, 28, 28 }, - { pick_batch_size(32, n), 192, 7, 7 }, - { pick_batch_size(32, n), 256, 28, 28 }, - { pick_batch_size(32, n), 3, 224, 224 }, - { pick_batch_size(32, n), 32, 28, 28 }, - { pick_batch_size(32, n), 48, 7, 7 }, - { pick_batch_size(32, n), 480, 128, 256 }, - { pick_batch_size(32, n), 480, 64, 128 }, - { pick_batch_size(32, n), 512, 4, 4 }, - { pick_batch_size(32, n), 512, 64, 128 }, - { pick_batch_size(16, n), 64, 56, 56 }, - { pick_batch_size(32, n), 832, 7, 7 }, - { pick_batch_size(64, n), 128, 56, 56 }, - { pick_batch_size(64, n), 256, 28, 28 }, - { pick_batch_size(64, n), 3, 224, 224 }, - { pick_batch_size(64, n), 512, 28, 28 }, - { pick_batch_size(64, n), 64, 112, 112 }, - { pick_batch_size(32, n), 64, 14, 14 }, - { pick_batch_size(32, n), 192, 14, 14 }, - { pick_batch_size(32, n), 320, 28, 28 }, - { pick_batch_size(32, n), 576, 14, 14 }, - { pick_batch_size(32, n), 576, 4, 4 }, - { pick_batch_size(32, n), 1056, 7, 7 }, - { pick_batch_size(32, n), 2048, 11, 11 }, -#if 
MIOPEN_TESTS_GET_INPUTS_ENABLE_HUGE_TENSORS - { pick_batch_size(32, n), 16, 2048, 2048 }, - { pick_batch_size(32, n), 16, 3072, 3072 }, - { pick_batch_size(32, n), 16, 4096, 4096 }, -#endif - { 1, 1, 1, 1 } - }; - // clang-format on -} - -template -inline std::set> get_weights(T n = MIOPEN_TEST_DEFAULT_BATCH_SIZE_FACTOR) -{ - // clang-format off - return - { - { pick_batch_size(1024, n),1024, 3, 3 }, - { pick_batch_size(1024, n),512, 3, 3 }, - { pick_batch_size(128, n), 256, 1, 1 }, - { pick_batch_size(128, n), 528, 1, 1 }, - { pick_batch_size(128, n), 96, 3, 3 }, - { pick_batch_size(16, n), 192, 1, 1 }, - { pick_batch_size(224, n), 112, 3, 3 }, - { pick_batch_size(256, n), 96, 5, 5 }, - { pick_batch_size(288, n), 144, 3, 3 }, - { pick_batch_size(48, n), 832, 1, 1 }, - { pick_batch_size(512, n), 256, 3, 3 }, - { pick_batch_size(64, n), 1, 2, 2 }, - { pick_batch_size(64, n), 3, 3, 3 }, - { pick_batch_size(64, n), 3, 7, 7 }, - { pick_batch_size(64, n), 32, 5, 5 }, - { pick_batch_size(64, n), 480, 1, 1 }, - { pick_batch_size(64, n), 64, 1, 1 }, - { pick_batch_size(96, n), 3, 11, 11 }, - { pick_batch_size(192, n), 64, 5, 5 }, - { pick_batch_size(64, n), 64, 3, 3 }, - { pick_batch_size(224, n), 224, 3, 3 }, - { pick_batch_size(224, n), 192, 3, 3 }, - { pick_batch_size(128, n), 320, 1, 1 }, - { pick_batch_size(192, n), 576, 1, 1 }, - { pick_batch_size(128, n), 1056, 1, 1 }, - { pick_batch_size(128, n), 1024, 1, 1 }, - { pick_batch_size(512, n), 2048, 1, 1 } - }; - // clang-format on -} - -template -inline std::set> get_immed_inputs(T n = MIOPEN_TEST_DEFAULT_BATCH_SIZE_FACTOR) -{ - // clang-format off - return - { - { pick_batch_size(32, n), 1, 14, 14 }, - { pick_batch_size(256, n), 1, 27, 27 }, - { pick_batch_size(128, n), 512, 12, 12 }, - { pick_batch_size(256, n), 256, 13, 13 }, - { pick_batch_size(256, n), 3, 227, 227 }, - { pick_batch_size(32, n), 64, 56, 56 }, - { pick_batch_size(32, n), 96, 14, 14 }, - { pick_batch_size(32, n), 96, 28, 28 }, - { pick_batch_size(64, 
n), 128, 56, 56 }, - { pick_batch_size(64, n), 3, 224, 224 }, - { pick_batch_size(64, n), 256, 14, 14 }, - { 1, 1, 1, 1 } - }; - // clang-format on -} - -template -inline std::set> get_immed_weights(T n = MIOPEN_TEST_DEFAULT_BATCH_SIZE_FACTOR) -{ - // clang-format off - return - { - { pick_batch_size(208, n), 96, 3, 3 }, - { pick_batch_size(24, n), 512, 1, 1 }, - { pick_batch_size(256, n), 128, 3, 3 }, - { pick_batch_size(256, n), 256, 3, 3 }, - { pick_batch_size(256, n), 64, 5, 5 }, - { pick_batch_size(288, n), 144, 3, 3 }, - { pick_batch_size(96, n), 3, 11, 11 }, - { pick_batch_size(32, n), 128, 5, 5 }, - { pick_batch_size(32, n), 128, 1, 1 }, - { pick_batch_size(256, n), 256, 3, 3 }, - { pick_batch_size(512, n), 512, 3, 3 }, - { pick_batch_size(160, n), 128, 3, 3 }, - { pick_batch_size(32, n), 3, 7, 7 } - }; - // clang-format on -} - -template -inline std::set> -get_3d_conv_input_shapes(T n = MIOPEN_TEST_DEFAULT_BATCH_SIZE_FACTOR) -{ - // clang-format off - return - { - { pick_batch_size(128, n), 1, 1, 2, 2}, - { pick_batch_size(128, n), 64, 1, 1, 1}, - { pick_batch_size(128, n), 64, 3, 4, 4}, - { pick_batch_size(352, n), 32, 4, 9, 9}, - { pick_batch_size(192, n), 512, 3, 14, 14}, - { pick_batch_size(352, n), 512, 4, 28, 28}, - { pick_batch_size(256, n), 512, 4, 56, 56}, - { pick_batch_size(192, n), 3, 4, 227, 227}, - { pick_batch_size(128, n), 4, 4, 161, 700} - }; - // clang-format on -} - -template -inline std::set> -get_3d_conv_weight_shapes(T n = MIOPEN_TEST_DEFAULT_BATCH_SIZE_FACTOR) -{ - // clang-format off - return - { - { pick_batch_size( 128, n), 1, 1, 1, 1}, - { pick_batch_size( 352, n), 128, 1, 1, 1}, - { pick_batch_size( 256, n), 128, 1, 1, 1}, - { pick_batch_size( 352, n), 32, 3, 3, 3}, - { pick_batch_size( 352, n), 4, 3, 3, 3}, - { pick_batch_size( 160, n), 4, 3, 5, 5}, - { pick_batch_size( 128, n), 64, 5, 7, 7}, - { pick_batch_size( 192, n), 4, 3, 11, 11}, - { pick_batch_size( 128, n), 1, 3, 1, 7}, - { pick_batch_size( 128, n), 1, 3, 7, 1}, - { 
pick_batch_size( 128, n), 1, 3, 5, 20} - }; - // clang-format on -} - -template -inline std::set> get_bn_peract_inputs(T n = MIOPEN_TEST_DEFAULT_BATCH_SIZE_FACTOR) -{ - // clang-format off - return - { - { pick_batch_size(32, n), 4, 1024,2048}, //Making this much smaller - { pick_batch_size(100, n), 3, 32, 32 }, - { pick_batch_size(100, n), 32, 8, 8 }, - { pick_batch_size(128, n), 256, 12, 12 }, - { pick_batch_size(256, n), 3, 227, 227 }, - { pick_batch_size(64, n), 64, 112, 112 },//Batch-norm ResNet 152 after this line - { pick_batch_size(256, n), 1024, 14, 14 },// n is from the paper @ 256 - { pick_batch_size(256, n), 2048, 7, 7 }, - { pick_batch_size(256, n), 256, 56, 56 }, - { pick_batch_size(256, n), 256, 14, 14 }, - { pick_batch_size(256, n), 512, 28, 28 }, - { pick_batch_size(256, n), 512, 7, 7 }, - { pick_batch_size(256, n), 64, 112, 112 }, - { pick_batch_size(256, n), 64, 56, 56 },//Batch-norm Inception_v3 after this - { pick_batch_size(32, n), 1024, 1, 1 },// n is from the paper @ 32 - { pick_batch_size(32, n), 128, 14, 14 }, - { pick_batch_size(32, n), 128, 28, 28 }, - { pick_batch_size(32, n), 128, 4, 4 }, - { pick_batch_size(32, n), 128, 7, 7 }, - { pick_batch_size(32, n), 160, 7, 7 }, - { pick_batch_size(32, n), 192, 14, 14 }, - { pick_batch_size(32, n), 192, 56, 56 }, - { pick_batch_size(32, n), 192, 7, 7 }, - { pick_batch_size(32, n), 224, 14, 14 }, - { pick_batch_size(32, n), 256, 7, 7 }, - { pick_batch_size(32, n), 256, 14, 14 }, - { pick_batch_size(32, n), 352, 7, 7 }, - { pick_batch_size(32, n), 64, 112, 112 }, - { pick_batch_size(32, n), 64, 14, 14 }, - { pick_batch_size(32, n), 64, 56, 56 }, - { pick_batch_size(32, n), 96, 28, 28 }, - { pick_batch_size(32, n), 32, 256, 512 }, //Killing this config. 
Takes way too long on the CPU - { pick_batch_size(32, n), 256, 28, 28 }, - { pick_batch_size(32, n), 3, 224, 224 }, - { pick_batch_size(32, n), 480, 128, 256 }, - { pick_batch_size(32, n), 528, 64, 128 } - }; - // clang-format on -} - -template -inline std::set> get_bn_spatial_inputs(T n = MIOPEN_TEST_DEFAULT_BATCH_SIZE_FACTOR) -{ - // clang-format off - return - { - { pick_batch_size(32, n), 4, 1024,2048}, //Making this much smaller - { pick_batch_size(32, n), 192, 256, 512 }, - { pick_batch_size(32, n), 480, 128, 256 }, - { pick_batch_size(256, n), 3, 227, 227 }, - { pick_batch_size(256, n), 64, 112, 112 }, - { pick_batch_size(512, n), 16, 32, 32 }, - { pick_batch_size(100, n), 32, 8, 8 }, - { pick_batch_size(128, n), 256, 12, 12 }, - { pick_batch_size(256, n), 128, 28, 28 }, - { pick_batch_size(256, n), 2048, 7, 7 }, - { pick_batch_size(256, n), 256, 56, 56 }, - { pick_batch_size(256, n), 256, 14, 14 }, - { pick_batch_size(256, n), 512, 28, 28 }, - { pick_batch_size(256, n), 512, 7, 7 }, - { pick_batch_size(256, n), 64, 56, 56 },//Batch-norm Inception_v3 after this - { pick_batch_size(32, n), 1024, 1, 1 },// n is from the paper @ 32 - { pick_batch_size(32, n), 128, 14, 14 }, - { pick_batch_size(32, n), 128, 4, 4 }, - { pick_batch_size(32, n), 160, 7, 7 }, - { pick_batch_size(32, n), 192, 14, 14 }, - { pick_batch_size(32, n), 192, 56, 56 }, - { pick_batch_size(32, n), 192, 7, 7 }, - { pick_batch_size(32, n), 224, 14, 14 }, - { pick_batch_size(32, n), 256, 7, 7 }, - { pick_batch_size(32, n), 352, 7, 7 }, - { pick_batch_size(32, n), 64, 14, 14 }, - { pick_batch_size(32, n), 64, 28, 28 }, - { pick_batch_size(32, n), 64, 56, 56 }, - { pick_batch_size(32, n), 96, 28, 28 }, - { pick_batch_size(32, n), 192, 256, 512 }, - { pick_batch_size(32, n), 256, 28, 28 }, - { pick_batch_size(32, n), 3, 224, 224 }, - { pick_batch_size(32, n), 480, 128, 256 }, - { pick_batch_size(32, n), 528, 64, 128 }, - { pick_batch_size(770, n), 1, 8, 8 }, - { pick_batch_size(770, n), 1024, 1, 1 
}, - { pick_batch_size(152, n), 128, 80, 80 }, - { pick_batch_size(152, n), 256, 20, 20 }, - { pick_batch_size(152, n), 32, 160, 160 }, - { pick_batch_size(152, n), 512, 20, 20 }, - { pick_batch_size(152, n), 64, 160, 160 }, - { pick_batch_size(152, n), 64, 80, 80 }, - { pick_batch_size(256, n), 256, 20, 20 }, - { pick_batch_size(256, n), 512, 20, 20 } - }; - // clang-format on -} - -template -inline std::set> get_3d_bn_peract_inputs(T n = MIOPEN_TEST_DEFAULT_BATCH_SIZE_FACTOR) -{ - // clang-format off - return - { - { pick_batch_size(32, n), 1, 32, 32, 32 }, // 32x32x32 based on VoxNet arch - { pick_batch_size(32, n), 1, 14, 14, 14 }, - { pick_batch_size(32, n), 32, 14, 14, 14 }, - { pick_batch_size(32, n), 32, 12, 12, 12 }, - { pick_batch_size(32, n), 32, 6, 6, 6 }, - { pick_batch_size(256, n), 1, 32, 32, 32 }, // 32x32x32 based on VoxNet arch - { pick_batch_size(256, n), 32, 14, 14, 14 }, - { pick_batch_size(256, n), 32, 12, 12, 12 }, - { pick_batch_size(256, n), 32, 6, 6, 6 }, - { pick_batch_size(512, n), 1, 32, 32, 32 }, // 32x32x32 based on VoxNet arch - { pick_batch_size(512, n), 32, 14, 14, 14 }, - { pick_batch_size(512, n), 32, 12, 12, 12 }, - { pick_batch_size(512, n), 32, 6, 6, 6 }, - { pick_batch_size(32, n), 2, 32, 57, 125 }, // Hand-gesture recognition CVPR 2015 paper High Res Net Path - { pick_batch_size(32, n), 32, 14, 25, 59 }, - { pick_batch_size(32, n), 32, 6, 10, 27 }, - { pick_batch_size(32, n), 32, 4, 6, 11 }, - { pick_batch_size(32, n), 32, 2, 2, 3 }, - { pick_batch_size(32, n), 32, 32, 28, 62 }, // Hand-gesture recognition CVPR 2015 paper Low Res Net Path - { pick_batch_size(32, n), 32, 14, 12, 29 }, - { pick_batch_size(32, n), 32, 6, 4, 12 }, - { pick_batch_size(32, n), 32, 4, 2, 2 }, - { pick_batch_size(16, n), 32, 6, 50, 50 }, // Multi-view 3D convnet - { pick_batch_size(1, n), 3, 8, 240, 320 }, // 3D convet on video - { pick_batch_size(1, n), 3, 16, 240, 320 }, // 3D convet on video - { pick_batch_size(1, n), 3, 8, 128, 171 }, // 3D 
convet on video - { pick_batch_size(1, n), 3, 16, 128, 171 }, // 3D convet on video - { pick_batch_size(1, n), 3, 8, 112, 112 }, // 3D convet on video - { pick_batch_size(1, n), 3, 16, 112, 112 } // 3D convet on video - }; - - // clang-format on -} - -template -inline std::set> -get_3d_bn_spatial_inputs(T n = MIOPEN_TEST_DEFAULT_BATCH_SIZE_FACTOR) -{ - // clang-format off - return - { - { pick_batch_size(32, n), 1, 32, 32, 32 }, // 32x32x32 based on VoxNet arch - { pick_batch_size(32, n), 1, 14, 14, 14 }, - { pick_batch_size(32, n), 32, 14, 14, 14 }, - { pick_batch_size(32, n), 32, 12, 12, 12 }, - { pick_batch_size(32, n), 32, 6, 6, 6 }, - { pick_batch_size(256, n), 1, 32, 32, 32 }, // 32x32x32 based on VoxNet arch - { pick_batch_size(256, n), 32, 14, 14, 14 }, - { pick_batch_size(256, n), 32, 12, 12, 12 }, - { pick_batch_size(256, n), 32, 6, 6, 6 }, - { pick_batch_size(512, n), 1, 32, 32, 32 }, // 32x32x32 based on VoxNet arch - { pick_batch_size(512, n), 32, 14, 14, 14 }, - { pick_batch_size(512, n), 32, 12, 12, 12 }, - { pick_batch_size(512, n), 32, 6, 6, 6 }, - { pick_batch_size(32, n), 2, 32, 57, 125 }, // Hand-gesture recognition CVPR 2015 paper High Res Net Path - { pick_batch_size(32, n), 32, 14, 25, 59 }, - { pick_batch_size(32, n), 32, 6, 10, 27 }, - { pick_batch_size(32, n), 32, 4, 6, 11 }, - { pick_batch_size(32, n), 32, 2, 2, 3 }, - { pick_batch_size(32, n), 32, 32, 28, 62 }, // Hand-gesture recognition CVPR 2015 paper Low Res Net Path - { pick_batch_size(32, n), 32, 14, 12, 29 }, - { pick_batch_size(32, n), 32, 6, 4, 12 }, - { pick_batch_size(32, n), 32, 4, 2, 2 }, - { pick_batch_size(16, n), 32, 6, 50, 50 }, // Multi-view 3D convnet - { pick_batch_size(1, n), 3, 8, 240, 320 }, // 3D convet on video - { pick_batch_size(1, n), 3, 16, 240, 320 }, // 3D convet on video - { pick_batch_size(1, n), 3, 8, 128, 171 }, // 3D convet on video - { pick_batch_size(1, n), 3, 16, 128, 171 }, // 3D convet on video - { pick_batch_size(1, n), 3, 8, 112, 112 }, // 3D 
convet on video - { pick_batch_size(1, n), 3, 16, 112, 112 } // 3D convet on video - }; - // clang-format on -} - -template -inline std::vector> get_sub_tensor() -{ - return {{16, 4, 8, 1, 4}, - {2, 4, 8, 8, 4}, - {16, 4, 8, 4}, - {13, 8, 4, 8}, - {3, 8, 7}, - {16, 4, 10}, - {3, 8}, - {16, 4}, - {4}}; -} - -template -inline std::vector> get_tensor_offsets() -{ - static_assert(std::is_signed_v); - return {{0, 0}, {0, 2}, {4, 0}, {5, 7}}; -} - -template -inline std::vector get_tensor_offset() -{ - static_assert(std::is_signed_v); - return {0, 1, 2, 3, 4, 5}; -} - +#include #endif diff --git a/projects/miopen/test/random.hpp b/projects/miopen/test/random.hpp index 62443abb1068..7c5c0efa5962 100644 --- a/projects/miopen/test/random.hpp +++ b/projects/miopen/test/random.hpp @@ -1,62 +1,5 @@ -/******************************************************************************* - * - * MIT License - * - * Copyright (c) 2021 Advanced Micro Devices, Inc. - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - * - *******************************************************************************/ +// Forwarding header — implementation moved to miopen_utils. #ifndef GUARD_MIOPEN_TEST_RANDOM_HPP #define GUARD_MIOPEN_TEST_RANDOM_HPP - -#include "../driver/random.hpp" - -namespace prng { -template -inline T gen_descreet_uniform_sign(double scale, int32_t range) -{ - return static_cast(scale * prng::gen_A_to_B(-range + 1, range)); -} - -template -inline T gen_descreet_unsigned(double scale, int32_t range) -{ - return static_cast(scale * static_cast(gen_0_to_B(range))); -} - -} // namespace prng - -// lambda factory -template -auto uniform_signed_initializer(ScaleT scale_arg, RangeT range_arg) -{ - return [=](auto&&...) -> T { - // uniform sign give balance of both negative and positive values - return prng::gen_descreet_uniform_sign(scale_arg, range_arg); - }; -} - -template -auto uniform_unsigned_initializer(ScaleT scale_arg, RangeT range_arg) -{ - return [=](auto&&...) -> T { return prng::gen_descreet_unsigned(scale_arg, range_arg); }; -} - -#endif // GUARD_MIOPEN_TEST_RANDOM_HPP +#include +#endif diff --git a/projects/miopen/test/rnn_util.hpp b/projects/miopen/test/rnn_util.hpp index d993d0df4c57..2a25f35e61a8 100644 --- a/projects/miopen/test/rnn_util.hpp +++ b/projects/miopen/test/rnn_util.hpp @@ -1,305 +1,5 @@ -/******************************************************************************* - * - * MIT License - * - * Copyright (c) 2023 Advanced Micro Devices, Inc. 
- * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - * - *******************************************************************************/ - +// Forwarding header — implementation moved to miopen_utils. 
#ifndef MIOPEN_RNN_UTIL_H_ #define MIOPEN_RNN_UTIL_H_ - -#include -#include -#include -#include -#include -#include -#include - -#include "gemm.hpp" -#include "random.hpp" - -#include - -// complexity O(NlogN) -inline std::vector GetReverseOrderIndex(const std::vector& base_index) -{ - std::vector reverse_index(base_index.size()); - unsigned next_rev_index = 0; - for(auto id : base_index) - reverse_index[id] = next_rev_index++; - return reverse_index; -}; - -inline std::vector GetSamplesIndexDescendingOrder(const std::vector& unsorted_seq_lens) -{ - const auto sample_count = unsorted_seq_lens.size(); - - std::vector index_v(sample_count); - std::iota(index_v.begin(), index_v.end(), 0); - - auto seq_len_cmp = [&unsorted_seq_lens](unsigned a_id, unsigned b_id) { - return unsorted_seq_lens[a_id] > unsorted_seq_lens[b_id]; - }; - - std::stable_sort(index_v.begin(), index_v.end(), seq_len_cmp); - - return index_v; -} - -template -inline void HiddenTensorReorder(const std::vector& src_array, - std::vector& dst_array, - const std::vector& batch_order, - const std::vector hid_len, - bool is_dst_direct_order) -{ - const size_t copy_size = hid_len[2]; - - const size_t batch_stride = hid_len[2]; - const size_t layer_stride = batch_stride * hid_len[1]; - - for(size_t batch_id = 0; batch_id < hid_len[1]; batch_id++) - { - const auto src_batch_off = - batch_stride * (is_dst_direct_order ? batch_order[batch_id] : batch_id); - const auto dst_batch_off = - batch_stride * (is_dst_direct_order ? 
batch_id : batch_order[batch_id]); - - for(size_t layer_id = 0; layer_id < hid_len[0]; layer_id++) - { - const auto dst_offset = dst_batch_off + layer_id * layer_stride; - const auto src_offset = src_batch_off + layer_id * layer_stride; - - std::copy(src_array.begin() + src_offset, - src_array.begin() + src_offset + copy_size, - dst_array.begin() + dst_offset); - } - } -} - -inline void createTensorDescArray(std::vector& td, - std::vector& ptd, - const std::vector bs, - const int secondDim, - miopenDataType_t dataType) -{ - - std::transform(bs.begin(), bs.end(), std::back_inserter(td), [&](int x) { - return miopen::TensorDescriptor( - dataType, {static_cast(x), static_cast(secondDim)}); - }); - std::transform(td.begin(), td.end(), std::back_inserter(ptd), [](miopen::TensorDescriptor& x) { - return &x; - }); -} - -inline std::tuple -GetTempPackedBuffersSize(std::vector batchs, int in_vec, int out_vec) -{ - size_t total_batch = std::accumulate(batchs.begin(), batchs.end(), 0ULL); - - size_t in_buff_size = total_batch * in_vec; - size_t out_buff_size = total_batch * out_vec; - return {in_buff_size, out_buff_size}; -} - -inline size_t getSuperTensorSize(const std::vector& bs, - int seqLength, - int inputSize, - int hiddenSize, - int maxPaddingVal, - bool isBidirect, - bool isInput, - bool isPadded) -{ - return (isPadded // - ? static_cast(seqLength) * maxPaddingVal - : std::accumulate(bs.begin(), bs.end(), 0ULL)) // - * (isInput // - ? static_cast(inputSize) - : static_cast(hiddenSize) * (isBidirect ? 
2 : 1)); -} - -template -void ChangeDataPadding(const std::vector& src_array, - std::vector& dst_array, - const std::vector& batch_list, - int max_batch, - int sample_size, - bool is_src_packed) -{ - auto seq_len = batch_list.size(); - - auto scr_ptr = &src_array[0]; - auto dst_ptr = &dst_array[0]; - - for(int seq_id = 0; seq_id < seq_len; seq_id++) - { - auto packed_size = batch_list[seq_id] * sample_size; - - std::copy(scr_ptr, scr_ptr + packed_size, dst_ptr); - - if(is_src_packed) - { - dst_ptr += max_batch * sample_size; - scr_ptr += packed_size; - } - else - { - scr_ptr += max_batch * sample_size; - dst_ptr += packed_size; - } - } -} - -// RNN VANILLA configs -inline std::vector get_rnn_num_layers() { return {{1, 3}}; } - -inline std::vector get_rnn_batchSize() { return {{1, 17}}; } - -inline std::vector get_rnn_seq_len() { return {{1, 3, 51}}; } - -inline std::vector get_rnn_vector_len() { return {31}; } - -inline std::vector get_rnn_hidden_size() { return {127}; } - -// LSTM configs -inline std::vector get_lstm_num_layers() { return {{1, 3}}; } - -inline std::vector get_lstm_batchSize() { return {{1, 17}}; } - -inline std::vector get_lstm_seq_len() { return {{1, 25}}; } - -inline std::vector get_lstm_vector_len() { return {17}; } - -inline std::vector get_lstm_hidden_size() { return {67}; } - -// GRU configs -inline std::vector get_gru_num_layers() { return {{1, 3}}; } - -inline std::vector get_gru_batchSize() { return {{1, 17}}; } - -inline std::vector get_gru_seq_len() { return {{1, 23}}; } - -inline std::vector get_gru_vector_len() { return {13}; } - -inline std::vector get_gru_hidden_size() { return {67}; } - -inline std::vector> generate_batchSeq(const int batchSize, const int seqLength) -{ - - static constexpr int modval = 3; - - int currentval = batchSize; - std::vector batchSeq; - batchSeq.reserve(seqLength); - for(int i = 0; i < seqLength; i++) - { - if(i > 0) - { - int nvalue = currentval - prng::gen_0_to_B(modval); - currentval = (nvalue < 1) ? 
1 : nvalue; - // printf("current value: %d\n", currentval); - } - // printf("adding a value to batch sequence: %d\n", currentval); - batchSeq.push_back(currentval); - } - return {batchSeq}; -} - -inline int sumvc(const std::vector& x) { return std::accumulate(x.begin(), x.end(), 0); } - -template -inline T activfunc(T x, int actvf) -{ - T alpha = static_cast(1), beta0 = static_cast(0), beta1 = static_cast(1); - if(actvf == 0) - { - return (x > 0) ? x : x * beta0; - } - else if(actvf == 2) - { - return static_cast(1 / (1 + std::exp(-x))); - } - return static_cast(alpha * std::tanh(beta1 * x)); -} - -template -inline T dervactivfunc(T x, int actvf) -{ - if(actvf == 0) - { - return static_cast(x > 0 ? 1 : 0); - } - else if(actvf == 2) - { - return static_cast(std::exp(-x) / (1 + std::exp(-x)) / (1 + std::exp(-x))); - } - - return static_cast(1 / std::cosh(x) / std::cosh(x)); -} - -template -void RNN_mm_cpu_batched(const Dtype* a_ptr, - size_t a_cols, - size_t a_rows, - size_t lda, - size_t a_stride, - int a_flags, - const Dtype* b_ptr, - size_t b_cols, - size_t b_rows, - size_t ldb, - size_t b_stride, - int b_flags, - Dtype* c_ptr, - size_t c_cols, - size_t c_rows, - size_t ldc, - size_t c_stride, - int batchCount, - double alpha, - double beta) -{ - for(int i = 0; i < batchCount; ++i) - { - gemm_cpu(a_ptr + a_stride * i, - a_cols, - a_rows, - lda, - a_flags == 1 ? true : false, - b_ptr + b_stride * i, - b_cols, - b_rows, - ldb, - b_flags == 1 ? true : false, - c_ptr + c_stride * i, - c_cols, - c_rows, - ldc, - alpha, - beta); - } -} - +#include #endif diff --git a/projects/miopen/test/serialize.hpp b/projects/miopen/test/serialize.hpp index 6b9b1b29632e..b9e948307a1e 100644 --- a/projects/miopen/test/serialize.hpp +++ b/projects/miopen/test/serialize.hpp @@ -1,129 +1,5 @@ -/******************************************************************************* - * - * MIT License - * - * Copyright (c) 2018 Advanced Micro Devices, Inc. 
- * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - * - *******************************************************************************/ - +// Forwarding header — implementation moved to miopen_utils. 
#ifndef MIOPEN_GUARD_TEST_SERIALIZE_HPP #define MIOPEN_GUARD_TEST_SERIALIZE_HPP - -#include -#include -#include -#include -#include -#include -#include - -template -struct is_trivial_serializable : std::is_trivially_copy_constructible -{ -}; - -template <> -struct is_trivial_serializable : std::true_type -{ -}; - -template -std::enable_if_t{}> serialize(std::ostream& os, const T& x) -{ - os.write(reinterpret_cast(&x), sizeof(T)); -} - -template -auto serialize(std::ostream& os, - const T& x) -> decltype(x.begin(), x.end(), T(x.begin(), x.end()), void()) -{ - std::size_t n = std::distance(x.begin(), x.end()); - serialize(os, n); - for(auto&& y : x) - serialize(os, y); -} - -template -std::enable_if_t>{}> -serialize(std::ostream& os, const std::tuple& t) -{ - miopen::unpack( - [&](auto&&... xs) { miopen::each_args([&](auto&& x) { serialize(os, x); }, xs...); }, t); -} - -template -std::enable_if_t{}> serialize(std::istream& is, T& x) -{ - is.read(reinterpret_cast(&x), sizeof(T)); -} - -template -std::enable_if_t{}> serialize(std::istream& is, std::vector& x) -{ - std::size_t n; - serialize(is, n); - x.resize(n); - is.read(reinterpret_cast(x.data()), sizeof(T) * n); -} - -template -auto serialize(std::istream& is, - T& x) -> decltype(x.begin(), x.end(), x.assign(x.begin(), x.end()), void()) -{ - using value_type = std::decay_t; - std::size_t n; - serialize(is, n); - std::vector v; - v.reserve(n); - for(std::size_t i = 0; i < n; i++) - { - value_type y; - serialize(is, y); - v.push_back(y); - } - x.assign(v.begin(), v.end()); -} - -template -std::enable_if_t>{}> -serialize(std::istream& is, - // cppcheck-suppress constParameter - std::tuple& t) -{ - miopen::unpack( - [&](auto&&... 
xs) { miopen::each_args([&](auto&& x) { serialize(is, x); }, xs...); }, t); -} - -template -void load(std::string name, T& x) -{ - std::ifstream is{name.c_str()}; - serialize(is, x); -} - -template -void save(std::string name, const T& x) -{ - std::ofstream os{name.c_str()}; - serialize(os, x); -} - +#include #endif diff --git a/projects/miopen/test/tensor_holder.hpp b/projects/miopen/test/tensor_holder.hpp index 64be2aa7c851..5f075eb9b528 100644 --- a/projects/miopen/test/tensor_holder.hpp +++ b/projects/miopen/test/tensor_holder.hpp @@ -1,505 +1,5 @@ -/******************************************************************************* - * - * MIT License - * - * Copyright (c) 2017 Advanced Micro Devices, Inc. - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - * - *******************************************************************************/ +// Forwarding header — implementation moved to miopen_utils. 
#ifndef GUARD_TENSOR_HOLDER_HPP #define GUARD_TENSOR_HOLDER_HPP - -#include "network_data.hpp" -#include -#include -#include -#include -#include -#include -#include "../driver/random.hpp" - -#include "serialize.hpp" - -#include -using half = half_float::half; -using hip_bfloat16 = bfloat16; -#include "../../src/kernels/hip_float8.hpp" -using float8_fnuz = miopen_f8::hip_f8; -using bfloat8_fnuz = miopen_f8::hip_f8; - -#include -#include - -template -void visit_tensor_size(std::size_t n, F f) -{ - switch(n) - { - case 0: { - f(std::integral_constant{}); - break; - } - case 1: { - f(std::integral_constant{}); - break; - } - case 2: { - f(std::integral_constant{}); - break; - } - case 3: { - f(std::integral_constant{}); - break; - } - case 4: { - f(std::integral_constant{}); - break; - } - case 5: { - f(std::integral_constant{}); - break; - } - default: throw std::runtime_error("Unknown tensor size"); - } -} - -template -struct miopen_type; - -template <> -struct miopen_type : std::integral_constant -{ -}; - -template <> -struct miopen_type : std::integral_constant -{ -}; - -template <> -struct miopen_type : std::integral_constant -{ -}; -template <> -struct miopen_type : std::integral_constant -{ -}; - -template <> -struct miopen_type : std::integral_constant -{ -}; - -template <> -struct miopen_type : std::integral_constant -{ -}; - -template <> -struct miopen_type : std::integral_constant -{ -}; - -template <> -struct miopen_type : std::integral_constant -{ -}; - -template <> -struct miopen_type : std::integral_constant -{ -}; - -template <> -struct miopen_type : std::integral_constant -{ -}; - -template <> -struct miopen_type : std::integral_constant -{ -}; - -template <> -struct miopen_type : std::integral_constant -{ -}; - -template -struct tensor -{ - using value_type = T; - miopen::TensorDescriptor desc; - std::vector data; - -#if defined(__clang__) || defined(__GNUG__) -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wdeprecated-declarations" 
-#endif - - tensor() : desc(miopen_type{}) {} - -#if defined(__clang__) || defined(__GNUG__) -#pragma GCC diagnostic pop -#endif - - template - tensor(const std::vector& dims) : desc(miopen_type{}, dims), data(desc.GetElementSpace()) - { - } - - template - tensor(const std::vector& dims, const std::vector& strides) - : desc(miopen_type{}, dims, strides), data(desc.GetElementSpace()) - { - assert(dims.size() == strides.size()); - } - - template - tensor(miopenTensorLayout_t layout, const std::vector& dims) - : desc(miopen_type{}, layout, dims), data(desc.GetElementSpace()) - { - } - - template - tensor(miopenTensorLayout_t layout, const std::vector& dims, const std::vector& strides) - : desc(miopen_type{}, layout, dims, strides), data(desc.GetElementSpace()) - { - assert(dims.size() == strides.size()); - } - - tensor(std::size_t n, std::size_t c, std::size_t h, std::size_t w) - : desc(miopen_type{}, {n, c, h, w}), data(n * c * h * w) - { - } - - tensor(miopenTensorLayout_t layout, std::size_t n, std::size_t c, std::size_t h, std::size_t w) - : desc(miopen_type{}, layout, {n, c, h, w}), data(desc.GetElementSpace()) - { - } - - tensor(std::size_t n, std::size_t c, std::size_t d, std::size_t h, std::size_t w) - : desc(miopen_type{}, {n, c, d, h, w}), data(n * c * d * h * w) - { - } - - tensor(std::size_t n) : desc(miopen_type{}, {n}), data(n) {} - - tensor(miopen::TensorDescriptor rhs) : desc(std::move(rhs)) - { - assert(desc.GetType() == miopen_type{} - /// In the driver, T is input tensor type, but output tensor holders - /// are instantiatied with T as well. This leads to false assertion - /// failures when T is INT8 because output type is different. 
- /// \todo Get rid of this hack when the driver is improved: - || (miopen_type{} == miopenInt8 && desc.GetType() == miopenInt32)); - data.resize(desc.GetElementSpace()); - } - - size_t GetDataByteSize() const { return GetSize() * sizeof(T); } - - size_t GetSize() const { return desc.GetElementSpace(); } - - template - tensor& generate(G g) & - { - if(this->desc.GetVectorLength() > 1) - this->generate_vect_impl(g); - else - this->generate_impl(g); - return *this; - } - - template - tensor&& generate(G g) && - { - if(this->desc.GetVectorLength() > 1) - this->generate_vect_impl(g); - else - this->generate_impl(g); - return std::move(*this); - } - - template - void generate_impl(G g) - { - auto seed = std::accumulate(desc.GetLengths().begin(), - desc.GetLengths().end(), - std::size_t{521288629}, - [](auto x, auto y) { - x ^= x << 1U; - return x ^ y; - }); - seed ^= data.size(); - seed ^= desc.GetLengths().size(); - prng::reset_seed(seed); - auto iterator = data.begin(); - auto assign = [&](T x) { - *iterator = x; - ++iterator; - }; - this->for_each( - miopen::compose(miopen::compose(assign, miopen::cast_to()), std::move(g))); - } - - template - void generate_vect_impl(G g) - { - auto seed = std::accumulate(desc.GetLengths().begin(), - desc.GetLengths().end(), - std::size_t{521288629}, - [](auto x, auto y) { - x ^= x << 1U; - return x ^ y; - }); - seed ^= data.size(); - seed ^= desc.GetLengths().size(); - prng::reset_seed(seed); - auto iterator = data.begin(); - auto vectorLength = desc.GetVectorLength(); - auto assign = [&](T x) { - assert(iterator < data.end()); - // for debugging - for(auto i = 0; i < vectorLength; i++) - { - *(iterator + i) = x; - } - iterator += vectorLength; - }; - this->for_each( - miopen::compose(miopen::compose(assign, miopen::cast_to()), std::move(g))); - } - - template - struct for_each_unpacked - { - Loop loop; - F f; - template - auto operator()(Ts... 
xs) const -> decltype(f(xs...), void()) - { - loop(xs...)(std::move(f)); - } - - struct any - { - any() {} - template - any(X) - { - } - }; - - [[noreturn]] void operator()(any = {}, - any = {}, - any = {}, - any = {}, - any = {}, - any = {}, - any = {}, - any = {}, - any = {}) const - { - throw std::runtime_error( - "Arguments to for_each do not match tensor size or the function " + - miopen::get_type_name() + " can not be called."); - } - }; - - struct for_each_handler - { - template - void operator()(Self* self, Loop loop, F f, Size size) const - { - auto dims = miopen::tien(self->desc.GetLengths()); - miopen::unpack(for_each_unpacked{loop, std::move(f)}, dims); - } - }; - - template - void for_each(F f) const - { - visit_tensor_size( - desc.GetLengths().size(), - std::bind(for_each_handler{}, this, miopen::ford, std::move(f), std::placeholders::_1)); - } - - template - void par_for_each(F f) const - { - visit_tensor_size( - desc.GetLengths().size(), - std::bind( - for_each_handler{}, this, miopen::par_ford, std::move(f), std::placeholders::_1)); - } - - template - T& operator()(Ts... xs) - { - assert(this->desc.GetIndex(xs...) < data.size()); - return this->data[this->desc.GetIndex(xs...)]; - } - - template - const T& operator()(Ts... xs) const - { - assert(this->desc.GetIndex(xs...) < data.size()); - return this->data[this->desc.GetIndex(xs...)]; - } - - template - const T& operator()(const std::array& multi_id) const - { - auto f = [&](auto... 
is) { return this->desc.GetIndex(is...); }; - assert(miopen::unpack(f, multi_id) < data.size()); - return this->data[miopen::unpack(f, multi_id)]; - } - - T& operator[](std::size_t i) { return data.at(i); } - - const T& operator[](std::size_t i) const { return data.at(i); } - - typename std::vector::iterator begin() { return data.begin(); } - - typename std::vector::iterator end() { return data.end(); } - - typename std::vector::const_iterator begin() const { return data.begin(); } - - typename std::vector::const_iterator end() const { return data.end(); } - - friend std::ostream& operator<<(std::ostream& stream, const tensor& t) - { - return stream << t.desc; - } - - template - void dump_inner(size_t dim, std::array& coord, Stream& stream) const - { - const auto lengths = this->desc.GetLengths(); - if(lengths.size() == 0) - { - // 0D special case: Just print the one value that we have and return. - stream << (*this)(coord); - } - else if(dim + 1 == lengths.size()) - { - // 1D special case: dump everything on one line - for(size_t i = 0; i < lengths[dim]; ++i) - { - if(i != 0) - stream << ' '; - - coord[dim] = i; - stream << std::setw(4) << (*this)(coord); - } - - stream << '\n'; - } - else - { - if(dim + 2 == lengths.size()) - { - // 2D special case: Also print which 2D slice we are currently printing - // Note: this is not needed for higher dimensions, as they will also pass - // through this branch. 
- stream << "slice ["; - for(size_t i = 0; i < dim; ++i) - { - stream << coord[i] << ", "; - } - stream << ":, :]\n"; - } - - for(size_t i = 0; i < lengths[dim]; ++i) - { - coord[dim] = i; - this->dump_inner(dim + 1, coord, stream); - } - } - } - - template - void dump(const char* name, Stream& stream = std::cout) const - { - const auto n = this->desc.GetLengths().size(); - stream << "==== " << name << ": " << *this << n << '\n'; - stream.fill(' '); - - const auto flags = stream.flags(); - - visit_tensor_size(n, [&](const auto size) { - constexpr size_t N = decltype(size)::value; - std::array coord; - this->dump_inner(0, coord, stream); - }); - - stream.flags(flags); - } -}; - -template -void serialize(std::istream& s, tensor& x) -{ - std::vector lens; - serialize(s, lens); - std::vector strides; - serialize(s, strides); - x.desc = miopen::TensorDescriptor{miopen_type{}, lens, strides}; - serialize(s, x.data); -} - -template -void serialize(std::ostream& s, const tensor& x) -{ - const auto& lens = x.desc.GetLengths(); - const auto& strides = x.desc.GetStrides(); - serialize(s, lens); - serialize(s, strides); - serialize(s, x.data); -} - -struct tensor_generate -{ - template - Tensor&& operator()(Tensor&& t, G g) const - { - return std::forward(t.generate(g)); - } -}; - -struct tensor_elem_gen_integer -{ - uint64_t max_value = 17; - - template - double operator()(Ts... 
Xs) const - { - static_assert(sizeof...(Ts) < 6, - "Dimensions in tensor_elem_gen_integer must be less than 6."); - assert(max_value > 0); - std::array left = {{Xs...}}; - std::array right = {{613, 547, 701, 877, 1049}}; - uint64_t dot = - std::inner_product(left.begin(), left.end(), right.begin(), static_cast(173)); - return static_cast(dot % max_value); - } -}; - +#include #endif diff --git a/projects/miopen/test/verify.hpp b/projects/miopen/test/verify.hpp index 1d7d9cf80a50..2bf12f1057a3 100644 --- a/projects/miopen/test/verify.hpp +++ b/projects/miopen/test/verify.hpp @@ -1,245 +1,5 @@ -/******************************************************************************* - * - * MIT License - * - * Copyright (c) 2017 Advanced Micro Devices, Inc. - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - * - *******************************************************************************/ +// Forwarding header — implementation moved to miopen_utils. 
#ifndef GUARD_VERIFY_HPP #define GUARD_VERIFY_HPP - -#include -#include -#include -#include -#include -#include -#include -#include -using half = half_float::half; -using hip_bfloat16 = bfloat16; -#include -#include "tensor_holder.hpp" - -namespace miopen { - -// Compute the value of a range -template -using range_value = typename std::decay().begin())>::type; - -struct sum_fn -{ - template - auto operator()(T x, U y) const MIOPEN_RETURNS(x + y); -}; -static constexpr sum_fn sum{}; - -struct max_fn -{ - template - static T id(T x) - { - return x; - } - - template - auto operator()(T x, U y) const MIOPEN_RETURNS(max_fn::id(x > y ? x : y)); -}; -static constexpr max_fn max{}; - -namespace abs_diff_detail { -using std::fabs; -struct fn -{ - template - auto operator()(T x, U y) const MIOPEN_RETURNS(fabs(x - y)); -}; - -} // namespace abs_diff_detail - -static constexpr abs_diff_detail::fn abs_diff{}; - -struct not_finite_fn -{ - template ), bool>::type = false> - bool operator()(T x) const - { - return !std::isfinite(x); - } - - template ::type, half_float::half>), - bool>::type = false> - bool operator()(T x) const - { - return !half_float::isfinite(x); - } - - template ::type, bfloat16>), - bool>::type = false> - bool operator()(T x) const - { - return !std::isfinite(x); // bfloat16 has float() conversion operator - } - - template ), bool>::type = false> - bool operator()(T x) const - { - std::ignore = x; - return false; - } -}; -static constexpr not_finite_fn not_finite{}; - -template -T as(T, U x) -{ - return x; -} - -struct compare_mag_fn -{ - template - bool operator()(T x, U y) const - { - using std::fabs; - return fabs(x) < fabs(y); - } -}; -static constexpr compare_mag_fn compare_mag{}; - -struct square_diff_fn -{ - template - double operator()(T x, U y) const - { - double diff = static_cast(x - y); - return diff * diff; - } -}; -static constexpr square_diff_fn square_diff{}; - -template , bool> = true> -bool equal_values(T const& lhs, T const& rhs) -{ - 
return lhs == rhs; -} - -template , bool> = true> -bool equal_values(T const& lhs, T const& rhs) -{ - return miopen::float_equal_sentinel(lhs, rhs); -} - -template -bool range_empty(R1&& r1) -{ - return r1.begin() == r1.end(); -} - -template -auto range_distance(R1&& r1) MIOPEN_RETURNS(std::distance(r1.begin(), r1.end())); - -template -bool range_zero(const std::vector& r) -{ - return std::all_of(r.begin(), r.end(), [](T x) { return equal_values(x, T()); }); -} - -template -bool range_zero(const tensor& r) -{ - return range_zero(r.data); -} - -template -T range_product(R1&& r1, R2&& r2, T state, Reducer r, Product p) -{ - return std::inner_product(r1.begin(), r1.end(), r2.begin(), state, r, p); -} - -template -std::size_t mismatch_idx(R1&& r1, R2&& r2, Compare compare) -{ - auto p = std::mismatch(r1.begin(), r1.end(), r2.begin(), compare); - return std::distance(r1.begin(), p.first); -} - -template -int64_t find_idx(R1&& r1, Predicate p) -{ - auto it = std::find_if(r1.begin(), r1.end(), p); - if(it == r1.end()) - return -1; - else - return std::distance(r1.begin(), it); -} - -template -double max_diff(R1&& r1, R2&& r2) -{ - return range_product(r1, r2, 0.0, max, abs_diff); -} - -template -auto max_diff_v2(R1&& r1, R2&& r2) -{ - using T = decltype(r1[0] - r2[0]); - auto abs_diff_func = [](auto x, auto y) { return x > y ? 
x - y : y - x; }; - // BUG: deduced wrong datatype, half_float bug - if constexpr(std::is_same_v) - return range_product(r1, r2, half_float::half(), max, abs_diff_func); - else - return range_product(r1, r2, T(), max, abs_diff_func); -} - -template -std::size_t mismatch_diff(R1&& r1, R2&& r2, T diff) -{ - return mismatch_idx( - r1, - r2, - std::bind( - float_equal, diff, std::bind(abs_diff, std::placeholders::_1, std::placeholders::_2))); -} - -template -double rms_range(R1&& r1, R2&& r2) -{ - std::size_t n = range_distance(r1); - if(n == range_distance(r2)) - { - if(n == 0) - return 0; - double square_difference = range_product(r1, r2, 0.0, sum_fn{}, square_diff); - double mag1 = static_cast(*std::max_element(r1.begin(), r1.end(), compare_mag)); - double mag2 = static_cast(*std::max_element(r2.begin(), r2.end(), compare_mag)); - double mag = - std::max({std::fabs(mag1), std::fabs(mag2), std::numeric_limits::min()}); - return std::sqrt(square_difference) / (std::sqrt(n) * mag); - } - else - return double(std::numeric_limits>::max()); -} -} // namespace miopen +#include #endif From 9d8ed0aed7ef456e693e407400c057d395d9aa4b Mon Sep 17 00:00:00 2001 From: Brad Pepers Date: Sat, 9 May 2026 07:40:07 -0600 Subject: [PATCH 04/11] Remove unnecessary test includes of driver headers MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Remove 15 unused #include directives where test files included driver headers without using any symbols from them: - 14 files included driver/tensor_driver.hpp unnecessarily - 1 file included driver/conv_common.hpp unnecessarily Remaining test→driver cross-references (3 files, all legitimate): - softmax_find20.cpp → mloSoftmaxHost.hpp (CPU reference, move later) - find_mode_trust_verify.cpp → driver.hpp (uses GPUMem) - kernel_tuning_net.cpp → driver.hpp (uses GPUMem) Co-Authored-By: Claude Sonnet 4 --- projects/miopen/test/gtest/adam.hpp | 1 - projects/miopen/test/gtest/addlayernorm.hpp | 1 - 
projects/miopen/test/gtest/cat.hpp | 1 - projects/miopen/test/gtest/conv3d_test_case.hpp | 1 - projects/miopen/test/gtest/getitem.hpp | 1 - projects/miopen/test/gtest/group_conv.hpp | 1 - projects/miopen/test/gtest/group_conv_deterministic_split_k.cpp | 1 - projects/miopen/test/gtest/groupnorm.hpp | 1 - projects/miopen/test/gtest/kthvalue.hpp | 1 - projects/miopen/test/gtest/layout_transpose.cpp | 1 - projects/miopen/test/gtest/reducecalculation.hpp | 1 - projects/miopen/test/gtest/reduceextreme.hpp | 1 - projects/miopen/test/gtest/rope.hpp | 1 - projects/miopen/test/gtest/t5layernorm.hpp | 1 - projects/miopen/test/gtest/transformers_adam_w.hpp | 1 - 15 files changed, 15 deletions(-) diff --git a/projects/miopen/test/gtest/adam.hpp b/projects/miopen/test/gtest/adam.hpp index 0efd9b390765..e54ddd1fc85d 100644 --- a/projects/miopen/test/gtest/adam.hpp +++ b/projects/miopen/test/gtest/adam.hpp @@ -24,7 +24,6 @@ * *******************************************************************************/ #define MIOPEN_BETA_API 1 -#include "../driver/tensor_driver.hpp" #include "cpu_adam.hpp" #include "get_handle.hpp" #include "random.hpp" diff --git a/projects/miopen/test/gtest/addlayernorm.hpp b/projects/miopen/test/gtest/addlayernorm.hpp index 0eba1588058d..511882710ff8 100644 --- a/projects/miopen/test/gtest/addlayernorm.hpp +++ b/projects/miopen/test/gtest/addlayernorm.hpp @@ -24,7 +24,6 @@ * *******************************************************************************/ -#include "../driver/tensor_driver.hpp" #include "get_handle.hpp" #include "random.hpp" #include "tensor_holder.hpp" diff --git a/projects/miopen/test/gtest/cat.hpp b/projects/miopen/test/gtest/cat.hpp index 8d5fb109e0ea..bf29ccc7bcb0 100644 --- a/projects/miopen/test/gtest/cat.hpp +++ b/projects/miopen/test/gtest/cat.hpp @@ -2,7 +2,6 @@ // SPDX-License-Identifier: MIT #define MIOPEN_BETA_API 1 -#include "../driver/tensor_driver.hpp" #include "cpu_cat.hpp" #include "get_handle.hpp" #include "random.hpp" 
diff --git a/projects/miopen/test/gtest/conv3d_test_case.hpp b/projects/miopen/test/gtest/conv3d_test_case.hpp index a10c1809cacf..d9a061941703 100644 --- a/projects/miopen/test/gtest/conv3d_test_case.hpp +++ b/projects/miopen/test/gtest/conv3d_test_case.hpp @@ -30,7 +30,6 @@ #include "get_handle.hpp" #include -#include "../driver/tensor_driver.hpp" #include "conv_common.hpp" #include "conv_test_base.hpp" diff --git a/projects/miopen/test/gtest/getitem.hpp b/projects/miopen/test/gtest/getitem.hpp index 22c98ca67b99..8889b1d3d457 100644 --- a/projects/miopen/test/gtest/getitem.hpp +++ b/projects/miopen/test/gtest/getitem.hpp @@ -24,7 +24,6 @@ * *******************************************************************************/ -#include "../driver/tensor_driver.hpp" #include "get_handle.hpp" #include "random.hpp" #include "tensor_holder.hpp" diff --git a/projects/miopen/test/gtest/group_conv.hpp b/projects/miopen/test/gtest/group_conv.hpp index d9ab9e080898..8acdd56548e2 100644 --- a/projects/miopen/test/gtest/group_conv.hpp +++ b/projects/miopen/test/gtest/group_conv.hpp @@ -32,7 +32,6 @@ #include #include -#include "../driver/tensor_driver.hpp" #include "conv_common.hpp" #include "gtest_common.hpp" diff --git a/projects/miopen/test/gtest/group_conv_deterministic_split_k.cpp b/projects/miopen/test/gtest/group_conv_deterministic_split_k.cpp index 3e141b72057e..7f9c62901733 100644 --- a/projects/miopen/test/gtest/group_conv_deterministic_split_k.cpp +++ b/projects/miopen/test/gtest/group_conv_deterministic_split_k.cpp @@ -30,7 +30,6 @@ #include #include "../random.hpp" #include "get_handle.hpp" -#include "../driver/tensor_driver.hpp" #include "conv_common.hpp" #include "gtest_common.hpp" diff --git a/projects/miopen/test/gtest/groupnorm.hpp b/projects/miopen/test/gtest/groupnorm.hpp index 33c4ed105f59..e28c5b652605 100644 --- a/projects/miopen/test/gtest/groupnorm.hpp +++ b/projects/miopen/test/gtest/groupnorm.hpp @@ -31,7 +31,6 @@ #include "cpu_groupnorm.hpp" #include 
"get_handle.hpp" #include "random.hpp" -#include "../driver/tensor_driver.hpp" #include "verify.hpp" #include diff --git a/projects/miopen/test/gtest/kthvalue.hpp b/projects/miopen/test/gtest/kthvalue.hpp index 2aa7e6fd41d1..58d7db388419 100644 --- a/projects/miopen/test/gtest/kthvalue.hpp +++ b/projects/miopen/test/gtest/kthvalue.hpp @@ -23,7 +23,6 @@ * SOFTWARE. * *******************************************************************************/ -#include "../driver/tensor_driver.hpp" #include "cpu_kthvalue.hpp" #include "get_handle.hpp" diff --git a/projects/miopen/test/gtest/layout_transpose.cpp b/projects/miopen/test/gtest/layout_transpose.cpp index f67c7a0387de..b4c86a99846a 100644 --- a/projects/miopen/test/gtest/layout_transpose.cpp +++ b/projects/miopen/test/gtest/layout_transpose.cpp @@ -25,7 +25,6 @@ *******************************************************************************/ #include -#include "../../driver/conv_common.hpp" #include #include #include diff --git a/projects/miopen/test/gtest/reducecalculation.hpp b/projects/miopen/test/gtest/reducecalculation.hpp index 2f2867423d5f..94b70ac8a1ea 100644 --- a/projects/miopen/test/gtest/reducecalculation.hpp +++ b/projects/miopen/test/gtest/reducecalculation.hpp @@ -24,7 +24,6 @@ * *******************************************************************************/ -#include "../driver/tensor_driver.hpp" #include "../src/kernels/MIOpenReduceCalculation.hpp" #include "get_handle.hpp" #include "random.hpp" diff --git a/projects/miopen/test/gtest/reduceextreme.hpp b/projects/miopen/test/gtest/reduceextreme.hpp index f884bb8fc5cf..4d2658a39569 100644 --- a/projects/miopen/test/gtest/reduceextreme.hpp +++ b/projects/miopen/test/gtest/reduceextreme.hpp @@ -24,7 +24,6 @@ * *******************************************************************************/ -#include "../driver/tensor_driver.hpp" #include "../src/kernels/MIOpenReduceExtreme.hpp" #include "get_handle.hpp" #include "random.hpp" diff --git 
a/projects/miopen/test/gtest/rope.hpp b/projects/miopen/test/gtest/rope.hpp index 8c8dd2ed2b3d..109ff0549978 100644 --- a/projects/miopen/test/gtest/rope.hpp +++ b/projects/miopen/test/gtest/rope.hpp @@ -24,7 +24,6 @@ * *******************************************************************************/ -#include "../driver/tensor_driver.hpp" #include "get_handle.hpp" #include "random.hpp" #include "tensor_holder.hpp" diff --git a/projects/miopen/test/gtest/t5layernorm.hpp b/projects/miopen/test/gtest/t5layernorm.hpp index 1ee2f2bd6ebe..e71819273683 100644 --- a/projects/miopen/test/gtest/t5layernorm.hpp +++ b/projects/miopen/test/gtest/t5layernorm.hpp @@ -24,7 +24,6 @@ * *******************************************************************************/ -#include "../driver/tensor_driver.hpp" #include "get_handle.hpp" #include "random.hpp" #include "tensor_holder.hpp" diff --git a/projects/miopen/test/gtest/transformers_adam_w.hpp b/projects/miopen/test/gtest/transformers_adam_w.hpp index d2a804841258..ef465fc98854 100644 --- a/projects/miopen/test/gtest/transformers_adam_w.hpp +++ b/projects/miopen/test/gtest/transformers_adam_w.hpp @@ -24,7 +24,6 @@ * *******************************************************************************/ #define MIOPEN_BETA_API 1 -#include "../driver/tensor_driver.hpp" #include "cpu_transformers_adam_w.hpp" #include "get_handle.hpp" #include "random.hpp" From ff45eddb953db509165863cae4e8179e28bcdedd Mon Sep 17 00:00:00 2001 From: Brad Pepers Date: Sat, 9 May 2026 08:07:18 -0600 Subject: [PATCH 05/11] =?UTF-8?q?Eliminate=20remaining=20test=E2=86=92driv?= =?UTF-8?q?er=20cross-includes?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Move mloSoftmaxHost.hpp (CPU softmax reference) to miopen_utils. Create gpu_mem.hpp forwarding header in miopen_utils for GPUMem (temporary Phase 1 shim; GPUMem extraction is Phase 2). Update 3 test files to include through miopen_utils instead of directly from driver/. 
Result: zero cross-includes between driver/ and test/ in either direction. The only remaining Phase 2 cleanup items are: - miopen_utils/gpu_mem.hpp → driver/driver.hpp (extract GPUMem) - common_utils/random.hpp → miopen/env.hpp (env var dependency) Co-Authored-By: Claude Sonnet 4 --- projects/miopen/driver/mloSoftmaxHost.hpp | 349 +---------------- .../include/miopen_utils/gpu_mem.hpp | 12 + .../include/miopen_utils/mloSoftmaxHost.hpp | 350 ++++++++++++++++++ .../test/gtest/find_mode_trust_verify.cpp | 2 +- .../miopen/test/gtest/kernel_tuning_net.cpp | 2 +- projects/miopen/test/gtest/softmax_find20.cpp | 2 +- 6 files changed, 367 insertions(+), 350 deletions(-) create mode 100644 projects/miopen/miopen_utils/include/miopen_utils/gpu_mem.hpp create mode 100644 projects/miopen/miopen_utils/include/miopen_utils/mloSoftmaxHost.hpp diff --git a/projects/miopen/driver/mloSoftmaxHost.hpp b/projects/miopen/driver/mloSoftmaxHost.hpp index fd0a1768e6a6..e0fec924c5b9 100644 --- a/projects/miopen/driver/mloSoftmaxHost.hpp +++ b/projects/miopen/driver/mloSoftmaxHost.hpp @@ -1,350 +1,5 @@ -// Copyright (c) Advanced Micro Devices, Inc., or its affiliates. -// SPDX-License-Identifier: MIT - +// Forwarding header — implementation moved to miopen_utils. #ifndef MLO_SOFTMAXHOST_H_ #define MLO_SOFTMAXHOST_H_ - -#include -#include - -//////////////////////////////////////////////////////////// -// -/////////////////////////////////////////////////////////// - -#define NEGATIVE_INF_FP32 (-1e20) -#define NEGATIVE_INF_FP16 (-1e5) - -template -T logaddexp(T x, T y, T neg_inf) -{ - T a = std::max(x, y); - T b = std::min(x, y); - T c = b - a; - - return c <= neg_inf ? 
std::max(a, neg_inf) : std::max(T(a + log(T(1) + exp(b - a))), neg_inf); -} - -template -int mloSoftmaxForwardRunHost(miopenTensorDescriptor_t inputTensor, - miopenTensorDescriptor_t outputTensor, - Tgpu* in, - Tcheck* outhost, - float alpha, - float beta, - miopenSoftmaxAlgorithm_t algo, - miopenSoftmaxMode_t mode) -{ - int n, c, h, w, in_nstr, in_cstr, in_hstr, in_wstr; - int out_nstr, out_cstr, out_hstr, out_wstr; - miopenGet4dTensorDescriptorLengths(inputTensor, &n, &c, &h, &w); - miopenGet4dTensorDescriptorStrides(inputTensor, &in_nstr, &in_cstr, &in_hstr, &in_wstr); - miopenGet4dTensorDescriptorStrides(outputTensor, &out_nstr, &out_cstr, &out_hstr, &out_wstr); - - Tcheck max_val = (sizeof(Tgpu) == 4) ? 3.402823466e+38f : 65504.; - std::vector channel_max((mode == MIOPEN_SOFTMAX_MODE_INSTANCE ? n : n * h * w), - static_cast(-max_val)); - std::vector results(n * c * h * w, static_cast(0.0)); - - int ret = 0; - - if(mode == MIOPEN_SOFTMAX_MODE_INSTANCE) - { - for(int i = 0; i < n; i++) - { - if(algo == MIOPEN_SOFTMAX_FAST) - { - for(int j = 0; j < c; j++) - for(int s0 = 0; s0 < h; s0++) - for(int s1 = 0; s1 < w; s1++) - { - results[(i * c + j) * h * w + s0 * w + s1] = static_cast( - in[i * in_nstr + j * in_cstr + s0 * in_hstr + s1 * in_wstr]); - } - } - else - { - for(int j = 0; j < c; j++) - for(int s0 = 0; s0 < h; s0++) - for(int s1 = 0; s1 < w; s1++) - { - channel_max[i] = std::max( - static_cast( - in[i * in_nstr + j * in_cstr + s0 * in_hstr + s1 * in_wstr]), - channel_max[i]); - } - - for(int j = 0; j < c; j++) - for(int s0 = 0; s0 < h; s0++) - for(int s1 = 0; s1 < w; s1++) - { - results[(i * c + j) * h * w + s0 * w + s1] = - static_cast( - in[i * in_nstr + j * in_cstr + s0 * in_hstr + s1 * in_wstr]) - - channel_max[i]; - } - } - - if(algo == MIOPEN_SOFTMAX_LOG) - { - Tcheck neg_inf = static_cast( - miopen::deref(inputTensor).GetType() == miopenHalf ? 
NEGATIVE_INF_FP16 - : NEGATIVE_INF_FP32); - channel_max[i] = neg_inf; - for(int j = 0; j < c; j++) - for(int s0 = 0; s0 < h; s0++) - for(int s1 = 0; s1 < w; s1++) - { - channel_max[i] = logaddexp(results[(i * c + j) * h * w + s0 * w + s1], - channel_max[i], - neg_inf); - } - - for(int j = 0; j < c; j++) - for(int s0 = 0; s0 < h; s0++) - for(int s1 = 0; s1 < w; s1++) - { - outhost[i * out_nstr + j * out_cstr + s0 * out_hstr + s1 * out_wstr] = - alpha * - (results[(i * c + j) * h * w + s0 * w + s1] - channel_max[i]) + - beta * outhost[i * out_nstr + j * out_cstr + s0 * out_hstr + - s1 * out_wstr]; - } - } - else - { - channel_max[i] = 0.0; - for(int j = 0; j < c; j++) - for(int s0 = 0; s0 < h; s0++) - for(int s1 = 0; s1 < w; s1++) - { - results[(i * c + j) * h * w + s0 * w + s1] = - exp(results[(i * c + j) * h * w + s0 * w + s1]); - channel_max[i] += results[(i * c + j) * h * w + s0 * w + s1]; - } - - for(int j = 0; j < c; j++) - for(int s0 = 0; s0 < h; s0++) - for(int s1 = 0; s1 < w; s1++) - { - outhost[i * out_nstr + j * out_cstr + s0 * out_hstr + s1 * out_wstr] = - alpha * - (results[(i * c + j) * h * w + s0 * w + s1] / channel_max[i]) + - beta * outhost[i * out_nstr + j * out_cstr + s0 * out_hstr + - s1 * out_wstr]; - } - } - } - } - else - { - for(int i = 0; i < n; i++) - { - for(int s0 = 0; s0 < h; s0++) - for(int s1 = 0; s1 < w; s1++) - { - if(algo == MIOPEN_SOFTMAX_FAST) - { - for(int j = 0; j < c; j++) - { - results[(i * c + j) * h * w + s0 * w + s1] = static_cast( - in[i * in_nstr + j * in_cstr + s0 * in_hstr + s1 * in_wstr]); - } - } - else - { - for(int j = 0; j < c; j++) - { - channel_max[i * h * w + s0 * w + s1] = std::max( - static_cast( - in[i * in_nstr + j * in_cstr + s0 * in_hstr + s1 * in_wstr]), - channel_max[i * h * w + s0 * w + s1]); - } - - for(int j = 0; j < c; j++) - { - results[(i * c + j) * h * w + s0 * w + s1] = - static_cast( - in[i * in_nstr + j * in_cstr + s0 * in_hstr + s1 * in_wstr]) - - channel_max[i * h * w + s0 * w + s1]; - } - } - 
- if(algo == MIOPEN_SOFTMAX_LOG) - { - Tcheck neg_inf = static_cast( - miopen::deref(inputTensor).GetType() == miopenHalf ? NEGATIVE_INF_FP16 - : NEGATIVE_INF_FP32); - channel_max[i * h * w + s0 * w + s1] = results[i * c * h * w + s0 * w + s1]; - for(int j = 1; j < c; j++) - { - channel_max[i * h * w + s0 * w + s1] = - logaddexp(results[(i * c + j) * h * w + s0 * w + s1], - channel_max[i * h * w + s0 * w + s1], - neg_inf); - } - - for(int j = 0; j < c; j++) - { - outhost[i * out_nstr + j * out_cstr + s0 * out_hstr + s1 * out_wstr] = - alpha * (results[(i * c + j) * h * w + s0 * w + s1] - - channel_max[i * h * w + s0 * w + s1]) + - beta * outhost[i * out_nstr + j * out_cstr + s0 * out_hstr + - s1 * out_wstr]; - } - } - else - { - channel_max[i * h * w + s0 * w + s1] = 0.0; - for(int j = 0; j < c; j++) - { - results[(i * c + j) * h * w + s0 * w + s1] = - exp(results[(i * c + j) * h * w + s0 * w + s1]); - channel_max[i * h * w + s0 * w + s1] += - results[(i * c + j) * h * w + s0 * w + s1]; - } - - for(int j = 0; j < c; j++) - { - outhost[i * out_nstr + j * out_cstr + s0 * out_hstr + s1 * out_wstr] = - alpha * (results[(i * c + j) * h * w + s0 * w + s1] / - channel_max[i * h * w + s0 * w + s1]) + - beta * outhost[i * out_nstr + j * out_cstr + s0 * out_hstr + - s1 * out_wstr]; - } - } - } - } - } - - return ret; -} - -template -int mloSoftmaxBackwardRunHost(miopenTensorDescriptor_t dInputTensor, - miopenTensorDescriptor_t dOutputTensor, - Tgpu* out, - Tgpu* dout, - Tcheck* dinhost, - float alpha, - float beta, - miopenSoftmaxAlgorithm_t algo, - miopenSoftmaxMode_t mode) -{ - int n, c, h, w, in_nstr, in_cstr, in_hstr, in_wstr; - int out_nstr, out_cstr, out_hstr, out_wstr; - miopenGet4dTensorDescriptorLengths(dOutputTensor, &n, &c, &h, &w); - miopenGet4dTensorDescriptorStrides(dInputTensor, &in_nstr, &in_cstr, &in_hstr, &in_wstr); - miopenGet4dTensorDescriptorStrides(dOutputTensor, &out_nstr, &out_cstr, &out_hstr, &out_wstr); - - std::vector channel_dot((mode == 
MIOPEN_SOFTMAX_MODE_INSTANCE ? n : n * h * w), - static_cast(0.0)); - std::vector results(n * c * h * w, static_cast(0.0)); - - int ret = 0; - - for(int i = 0; i < n; i++) - { - if(mode == MIOPEN_SOFTMAX_MODE_INSTANCE) - { - for(int j = 0; j < c; j++) - for(int s0 = 0; s0 < h; s0++) - for(int s1 = 0; s1 < w; s1++) - { - if(algo == MIOPEN_SOFTMAX_LOG) - { - channel_dot[i] += static_cast( - dout[i * out_nstr + j * out_cstr + s0 * out_hstr + s1 * out_wstr]); - } - else - { - channel_dot[i] += - static_cast(out[i * out_nstr + j * out_cstr + - s0 * out_hstr + s1 * out_wstr]) * - static_cast(dout[i * out_nstr + j * out_cstr + - s0 * out_hstr + s1 * out_wstr]); - } - } - - for(int j = 0; j < c; j++) - for(int s0 = 0; s0 < h; s0++) - for(int s1 = 0; s1 < w; s1++) - { - if(algo == MIOPEN_SOFTMAX_LOG) - { - results[(i * c + j) * h * w + s0 * w + s1] = - static_cast(dout[i * out_nstr + j * out_cstr + - s0 * out_hstr + s1 * out_wstr]) - - channel_dot[i] * std::exp(out[i * out_nstr + j * out_cstr + - s0 * out_hstr + s1 * out_wstr]); - } - else - { - results[(i * c + j) * h * w + s0 * w + s1] = - static_cast(dout[i * out_nstr + j * out_cstr + - s0 * out_hstr + s1 * out_wstr]) - - channel_dot[i]; - - results[(i * c + j) * h * w + s0 * w + s1] *= static_cast( - out[i * out_nstr + j * out_cstr + s0 * out_hstr + s1 * out_wstr]); - } - dinhost[i * in_nstr + j * in_cstr + s0 * in_hstr + s1 * in_wstr] = - alpha * results[(i * c + j) * h * w + s0 * w + s1] + - beta * dinhost[i * in_nstr + j * in_cstr + s0 * in_hstr + s1 * in_wstr]; - } - } - else - { - for(int s0 = 0; s0 < h; s0++) - for(int s1 = 0; s1 < w; s1++) - { - for(int j = 0; j < c; j++) - { - if(algo == MIOPEN_SOFTMAX_LOG) - { - channel_dot[i * h * w + s0 * w + s1] += static_cast( - dout[i * out_nstr + j * out_cstr + s0 * out_hstr + s1 * out_wstr]); - } - else - { - channel_dot[i * h * w + s0 * w + s1] += - static_cast(out[i * out_nstr + j * out_cstr + - s0 * out_hstr + s1 * out_wstr]) * - static_cast(dout[i * out_nstr + j * 
out_cstr + - s0 * out_hstr + s1 * out_wstr]); - } - } - - for(int j = 0; j < c; j++) - { - if(algo == MIOPEN_SOFTMAX_LOG) - { - results[(i * c + j) * h * w + s0 * w + s1] = - static_cast(dout[i * out_nstr + j * out_cstr + - s0 * out_hstr + s1 * out_wstr]) - - channel_dot[i * h * w + s0 * w + s1] * - std::exp(out[i * out_nstr + j * out_cstr + s0 * out_hstr + - s1 * out_wstr]); - } - else - { - results[(i * c + j) * h * w + s0 * w + s1] = - static_cast(dout[i * out_nstr + j * out_cstr + - s0 * out_hstr + s1 * out_wstr]) - - channel_dot[i * h * w + s0 * w + s1]; - - results[(i * c + j) * h * w + s0 * w + s1] *= static_cast( - out[i * out_nstr + j * out_cstr + s0 * out_hstr + s1 * out_wstr]); - } - dinhost[i * in_nstr + j * in_cstr + s0 * in_hstr + s1 * in_wstr] = - alpha * results[(i * c + j) * h * w + s0 * w + s1] + - beta * dinhost[i * in_nstr + j * in_cstr + s0 * in_hstr + s1 * in_wstr]; - } - } - } - } - - return ret; -} - +#include #endif diff --git a/projects/miopen/miopen_utils/include/miopen_utils/gpu_mem.hpp b/projects/miopen/miopen_utils/include/miopen_utils/gpu_mem.hpp new file mode 100644 index 000000000000..ee1f52b3090d --- /dev/null +++ b/projects/miopen/miopen_utils/include/miopen_utils/gpu_mem.hpp @@ -0,0 +1,12 @@ +// Forwarding header — GPUMem is defined in driver/driver.hpp. +// This allows test code to include GPUMem without directly depending +// on the driver/ directory. The GPUMem class should eventually be +// extracted into a standalone header here. +#ifndef GUARD_MIOPEN_UTILS_GPU_MEM_HPP +#define GUARD_MIOPEN_UTILS_GPU_MEM_HPP + +// Phase 1: Forward to driver.hpp which defines GPUMem. +// Phase 2: Extract GPUMem into this file directly. 
+#include "../../driver/driver.hpp" + +#endif diff --git a/projects/miopen/miopen_utils/include/miopen_utils/mloSoftmaxHost.hpp b/projects/miopen/miopen_utils/include/miopen_utils/mloSoftmaxHost.hpp new file mode 100644 index 000000000000..fd0a1768e6a6 --- /dev/null +++ b/projects/miopen/miopen_utils/include/miopen_utils/mloSoftmaxHost.hpp @@ -0,0 +1,350 @@ +// Copyright (c) Advanced Micro Devices, Inc., or its affiliates. +// SPDX-License-Identifier: MIT + +#ifndef MLO_SOFTMAXHOST_H_ +#define MLO_SOFTMAXHOST_H_ + +#include +#include + +//////////////////////////////////////////////////////////// +// +/////////////////////////////////////////////////////////// + +#define NEGATIVE_INF_FP32 (-1e20) +#define NEGATIVE_INF_FP16 (-1e5) + +template +T logaddexp(T x, T y, T neg_inf) +{ + T a = std::max(x, y); + T b = std::min(x, y); + T c = b - a; + + return c <= neg_inf ? std::max(a, neg_inf) : std::max(T(a + log(T(1) + exp(b - a))), neg_inf); +} + +template +int mloSoftmaxForwardRunHost(miopenTensorDescriptor_t inputTensor, + miopenTensorDescriptor_t outputTensor, + Tgpu* in, + Tcheck* outhost, + float alpha, + float beta, + miopenSoftmaxAlgorithm_t algo, + miopenSoftmaxMode_t mode) +{ + int n, c, h, w, in_nstr, in_cstr, in_hstr, in_wstr; + int out_nstr, out_cstr, out_hstr, out_wstr; + miopenGet4dTensorDescriptorLengths(inputTensor, &n, &c, &h, &w); + miopenGet4dTensorDescriptorStrides(inputTensor, &in_nstr, &in_cstr, &in_hstr, &in_wstr); + miopenGet4dTensorDescriptorStrides(outputTensor, &out_nstr, &out_cstr, &out_hstr, &out_wstr); + + Tcheck max_val = (sizeof(Tgpu) == 4) ? 3.402823466e+38f : 65504.; + std::vector channel_max((mode == MIOPEN_SOFTMAX_MODE_INSTANCE ? 
n : n * h * w), + static_cast(-max_val)); + std::vector results(n * c * h * w, static_cast(0.0)); + + int ret = 0; + + if(mode == MIOPEN_SOFTMAX_MODE_INSTANCE) + { + for(int i = 0; i < n; i++) + { + if(algo == MIOPEN_SOFTMAX_FAST) + { + for(int j = 0; j < c; j++) + for(int s0 = 0; s0 < h; s0++) + for(int s1 = 0; s1 < w; s1++) + { + results[(i * c + j) * h * w + s0 * w + s1] = static_cast( + in[i * in_nstr + j * in_cstr + s0 * in_hstr + s1 * in_wstr]); + } + } + else + { + for(int j = 0; j < c; j++) + for(int s0 = 0; s0 < h; s0++) + for(int s1 = 0; s1 < w; s1++) + { + channel_max[i] = std::max( + static_cast( + in[i * in_nstr + j * in_cstr + s0 * in_hstr + s1 * in_wstr]), + channel_max[i]); + } + + for(int j = 0; j < c; j++) + for(int s0 = 0; s0 < h; s0++) + for(int s1 = 0; s1 < w; s1++) + { + results[(i * c + j) * h * w + s0 * w + s1] = + static_cast( + in[i * in_nstr + j * in_cstr + s0 * in_hstr + s1 * in_wstr]) - + channel_max[i]; + } + } + + if(algo == MIOPEN_SOFTMAX_LOG) + { + Tcheck neg_inf = static_cast( + miopen::deref(inputTensor).GetType() == miopenHalf ? 
NEGATIVE_INF_FP16 + : NEGATIVE_INF_FP32); + channel_max[i] = neg_inf; + for(int j = 0; j < c; j++) + for(int s0 = 0; s0 < h; s0++) + for(int s1 = 0; s1 < w; s1++) + { + channel_max[i] = logaddexp(results[(i * c + j) * h * w + s0 * w + s1], + channel_max[i], + neg_inf); + } + + for(int j = 0; j < c; j++) + for(int s0 = 0; s0 < h; s0++) + for(int s1 = 0; s1 < w; s1++) + { + outhost[i * out_nstr + j * out_cstr + s0 * out_hstr + s1 * out_wstr] = + alpha * + (results[(i * c + j) * h * w + s0 * w + s1] - channel_max[i]) + + beta * outhost[i * out_nstr + j * out_cstr + s0 * out_hstr + + s1 * out_wstr]; + } + } + else + { + channel_max[i] = 0.0; + for(int j = 0; j < c; j++) + for(int s0 = 0; s0 < h; s0++) + for(int s1 = 0; s1 < w; s1++) + { + results[(i * c + j) * h * w + s0 * w + s1] = + exp(results[(i * c + j) * h * w + s0 * w + s1]); + channel_max[i] += results[(i * c + j) * h * w + s0 * w + s1]; + } + + for(int j = 0; j < c; j++) + for(int s0 = 0; s0 < h; s0++) + for(int s1 = 0; s1 < w; s1++) + { + outhost[i * out_nstr + j * out_cstr + s0 * out_hstr + s1 * out_wstr] = + alpha * + (results[(i * c + j) * h * w + s0 * w + s1] / channel_max[i]) + + beta * outhost[i * out_nstr + j * out_cstr + s0 * out_hstr + + s1 * out_wstr]; + } + } + } + } + else + { + for(int i = 0; i < n; i++) + { + for(int s0 = 0; s0 < h; s0++) + for(int s1 = 0; s1 < w; s1++) + { + if(algo == MIOPEN_SOFTMAX_FAST) + { + for(int j = 0; j < c; j++) + { + results[(i * c + j) * h * w + s0 * w + s1] = static_cast( + in[i * in_nstr + j * in_cstr + s0 * in_hstr + s1 * in_wstr]); + } + } + else + { + for(int j = 0; j < c; j++) + { + channel_max[i * h * w + s0 * w + s1] = std::max( + static_cast( + in[i * in_nstr + j * in_cstr + s0 * in_hstr + s1 * in_wstr]), + channel_max[i * h * w + s0 * w + s1]); + } + + for(int j = 0; j < c; j++) + { + results[(i * c + j) * h * w + s0 * w + s1] = + static_cast( + in[i * in_nstr + j * in_cstr + s0 * in_hstr + s1 * in_wstr]) - + channel_max[i * h * w + s0 * w + s1]; + } + } + 
+ if(algo == MIOPEN_SOFTMAX_LOG) + { + Tcheck neg_inf = static_cast( + miopen::deref(inputTensor).GetType() == miopenHalf ? NEGATIVE_INF_FP16 + : NEGATIVE_INF_FP32); + channel_max[i * h * w + s0 * w + s1] = results[i * c * h * w + s0 * w + s1]; + for(int j = 1; j < c; j++) + { + channel_max[i * h * w + s0 * w + s1] = + logaddexp(results[(i * c + j) * h * w + s0 * w + s1], + channel_max[i * h * w + s0 * w + s1], + neg_inf); + } + + for(int j = 0; j < c; j++) + { + outhost[i * out_nstr + j * out_cstr + s0 * out_hstr + s1 * out_wstr] = + alpha * (results[(i * c + j) * h * w + s0 * w + s1] - + channel_max[i * h * w + s0 * w + s1]) + + beta * outhost[i * out_nstr + j * out_cstr + s0 * out_hstr + + s1 * out_wstr]; + } + } + else + { + channel_max[i * h * w + s0 * w + s1] = 0.0; + for(int j = 0; j < c; j++) + { + results[(i * c + j) * h * w + s0 * w + s1] = + exp(results[(i * c + j) * h * w + s0 * w + s1]); + channel_max[i * h * w + s0 * w + s1] += + results[(i * c + j) * h * w + s0 * w + s1]; + } + + for(int j = 0; j < c; j++) + { + outhost[i * out_nstr + j * out_cstr + s0 * out_hstr + s1 * out_wstr] = + alpha * (results[(i * c + j) * h * w + s0 * w + s1] / + channel_max[i * h * w + s0 * w + s1]) + + beta * outhost[i * out_nstr + j * out_cstr + s0 * out_hstr + + s1 * out_wstr]; + } + } + } + } + } + + return ret; +} + +template +int mloSoftmaxBackwardRunHost(miopenTensorDescriptor_t dInputTensor, + miopenTensorDescriptor_t dOutputTensor, + Tgpu* out, + Tgpu* dout, + Tcheck* dinhost, + float alpha, + float beta, + miopenSoftmaxAlgorithm_t algo, + miopenSoftmaxMode_t mode) +{ + int n, c, h, w, in_nstr, in_cstr, in_hstr, in_wstr; + int out_nstr, out_cstr, out_hstr, out_wstr; + miopenGet4dTensorDescriptorLengths(dOutputTensor, &n, &c, &h, &w); + miopenGet4dTensorDescriptorStrides(dInputTensor, &in_nstr, &in_cstr, &in_hstr, &in_wstr); + miopenGet4dTensorDescriptorStrides(dOutputTensor, &out_nstr, &out_cstr, &out_hstr, &out_wstr); + + std::vector channel_dot((mode == 
MIOPEN_SOFTMAX_MODE_INSTANCE ? n : n * h * w), + static_cast(0.0)); + std::vector results(n * c * h * w, static_cast(0.0)); + + int ret = 0; + + for(int i = 0; i < n; i++) + { + if(mode == MIOPEN_SOFTMAX_MODE_INSTANCE) + { + for(int j = 0; j < c; j++) + for(int s0 = 0; s0 < h; s0++) + for(int s1 = 0; s1 < w; s1++) + { + if(algo == MIOPEN_SOFTMAX_LOG) + { + channel_dot[i] += static_cast( + dout[i * out_nstr + j * out_cstr + s0 * out_hstr + s1 * out_wstr]); + } + else + { + channel_dot[i] += + static_cast(out[i * out_nstr + j * out_cstr + + s0 * out_hstr + s1 * out_wstr]) * + static_cast(dout[i * out_nstr + j * out_cstr + + s0 * out_hstr + s1 * out_wstr]); + } + } + + for(int j = 0; j < c; j++) + for(int s0 = 0; s0 < h; s0++) + for(int s1 = 0; s1 < w; s1++) + { + if(algo == MIOPEN_SOFTMAX_LOG) + { + results[(i * c + j) * h * w + s0 * w + s1] = + static_cast(dout[i * out_nstr + j * out_cstr + + s0 * out_hstr + s1 * out_wstr]) - + channel_dot[i] * std::exp(out[i * out_nstr + j * out_cstr + + s0 * out_hstr + s1 * out_wstr]); + } + else + { + results[(i * c + j) * h * w + s0 * w + s1] = + static_cast(dout[i * out_nstr + j * out_cstr + + s0 * out_hstr + s1 * out_wstr]) - + channel_dot[i]; + + results[(i * c + j) * h * w + s0 * w + s1] *= static_cast( + out[i * out_nstr + j * out_cstr + s0 * out_hstr + s1 * out_wstr]); + } + dinhost[i * in_nstr + j * in_cstr + s0 * in_hstr + s1 * in_wstr] = + alpha * results[(i * c + j) * h * w + s0 * w + s1] + + beta * dinhost[i * in_nstr + j * in_cstr + s0 * in_hstr + s1 * in_wstr]; + } + } + else + { + for(int s0 = 0; s0 < h; s0++) + for(int s1 = 0; s1 < w; s1++) + { + for(int j = 0; j < c; j++) + { + if(algo == MIOPEN_SOFTMAX_LOG) + { + channel_dot[i * h * w + s0 * w + s1] += static_cast( + dout[i * out_nstr + j * out_cstr + s0 * out_hstr + s1 * out_wstr]); + } + else + { + channel_dot[i * h * w + s0 * w + s1] += + static_cast(out[i * out_nstr + j * out_cstr + + s0 * out_hstr + s1 * out_wstr]) * + static_cast(dout[i * out_nstr + j * 
out_cstr + + s0 * out_hstr + s1 * out_wstr]); + } + } + + for(int j = 0; j < c; j++) + { + if(algo == MIOPEN_SOFTMAX_LOG) + { + results[(i * c + j) * h * w + s0 * w + s1] = + static_cast(dout[i * out_nstr + j * out_cstr + + s0 * out_hstr + s1 * out_wstr]) - + channel_dot[i * h * w + s0 * w + s1] * + std::exp(out[i * out_nstr + j * out_cstr + s0 * out_hstr + + s1 * out_wstr]); + } + else + { + results[(i * c + j) * h * w + s0 * w + s1] = + static_cast(dout[i * out_nstr + j * out_cstr + + s0 * out_hstr + s1 * out_wstr]) - + channel_dot[i * h * w + s0 * w + s1]; + + results[(i * c + j) * h * w + s0 * w + s1] *= static_cast( + out[i * out_nstr + j * out_cstr + s0 * out_hstr + s1 * out_wstr]); + } + dinhost[i * in_nstr + j * in_cstr + s0 * in_hstr + s1 * in_wstr] = + alpha * results[(i * c + j) * h * w + s0 * w + s1] + + beta * dinhost[i * in_nstr + j * in_cstr + s0 * in_hstr + s1 * in_wstr]; + } + } + } + } + + return ret; +} + +#endif diff --git a/projects/miopen/test/gtest/find_mode_trust_verify.cpp b/projects/miopen/test/gtest/find_mode_trust_verify.cpp index 021a593f3372..178b1edff149 100644 --- a/projects/miopen/test/gtest/find_mode_trust_verify.cpp +++ b/projects/miopen/test/gtest/find_mode_trust_verify.cpp @@ -26,7 +26,7 @@ #include #include -#include "../../driver/driver.hpp" +#include namespace miopen { std::vector diff --git a/projects/miopen/test/gtest/kernel_tuning_net.cpp b/projects/miopen/test/gtest/kernel_tuning_net.cpp index 304adb9800d4..760a099b2ef4 100644 --- a/projects/miopen/test/gtest/kernel_tuning_net.cpp +++ b/projects/miopen/test/gtest/kernel_tuning_net.cpp @@ -30,7 +30,7 @@ #include #include #include -#include "../../driver/driver.hpp" +#include struct KernelTuningNetTestCase : AIModelTestCase { diff --git a/projects/miopen/test/gtest/softmax_find20.cpp b/projects/miopen/test/gtest/softmax_find20.cpp index c3f4857c38c8..d9acb567b7c8 100644 --- a/projects/miopen/test/gtest/softmax_find20.cpp +++ b/projects/miopen/test/gtest/softmax_find20.cpp 
@@ -28,7 +28,7 @@ #include "test.hpp" #include "get_handle.hpp" #include "tensor_holder.hpp" -#include "../driver/mloSoftmaxHost.hpp" +#include #include "verify.hpp" #include From 2da91e01a89914c945c33d18ac7fa412a85c8a03 Mon Sep 17 00:00:00 2001 From: Brad Pepers Date: Sat, 9 May 2026 10:19:25 -0600 Subject: [PATCH 06/11] Fix forwarding headers: remove duplicate include guards MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The forwarding headers (e.g., src/include/miopen/rank.hpp) and their targets (e.g., common_utils/include/common_utils/rank.hpp) used the same include guard macro. This caused the target's content to be skipped when included through the forwarding header, since the guard was already defined by the forwarder. Fix: Remove include guards from all forwarding headers entirely. They contain no content of their own — just a single #include — so the target file's own guard provides all necessary protection. Affects 26 forwarding headers across src/include/miopen/, test/, and driver/. 
Co-Authored-By: Claude Sonnet 4 --- projects/miopen/driver/mloSoftmaxHost.hpp | 3 -- projects/miopen/driver/random.hpp | 3 -- .../miopen/src/include/miopen/algorithm.hpp | 30 +------------------ .../miopen/src/include/miopen/bfloat16.hpp | 3 -- .../miopen/src/include/miopen/each_args.hpp | 30 +------------------ .../miopen/src/include/miopen/float_equal.hpp | 30 +------------------ projects/miopen/src/include/miopen/ford.hpp | 3 -- .../miopen/src/include/miopen/functional.hpp | 3 -- .../miopen/src/include/miopen/par_for.hpp | 30 +------------------ projects/miopen/src/include/miopen/rank.hpp | 30 +------------------ .../src/include/miopen/reduce_common.hpp | 3 -- .../miopen/src/include/miopen/returns.hpp | 30 +------------------ .../miopen/src/include/miopen/stringutils.hpp | 3 -- .../miopen/src/include/miopen/type_name.hpp | 30 +------------------ projects/miopen/test/cpu_bias.hpp | 3 -- projects/miopen/test/cpu_conv.hpp | 3 -- projects/miopen/test/cpu_layernorm.hpp | 3 -- projects/miopen/test/cpu_reduce_util.hpp | 3 -- projects/miopen/test/fusionHost.hpp | 1 - projects/miopen/test/gemm.hpp | 3 -- projects/miopen/test/network_data.hpp | 3 -- projects/miopen/test/random.hpp | 3 -- projects/miopen/test/rnn_util.hpp | 3 -- projects/miopen/test/serialize.hpp | 3 -- projects/miopen/test/tensor_holder.hpp | 3 -- projects/miopen/test/verify.hpp | 3 -- 26 files changed, 7 insertions(+), 258 deletions(-) diff --git a/projects/miopen/driver/mloSoftmaxHost.hpp b/projects/miopen/driver/mloSoftmaxHost.hpp index e0fec924c5b9..928eb6f63490 100644 --- a/projects/miopen/driver/mloSoftmaxHost.hpp +++ b/projects/miopen/driver/mloSoftmaxHost.hpp @@ -1,5 +1,2 @@ // Forwarding header — implementation moved to miopen_utils. 
-#ifndef MLO_SOFTMAXHOST_H_ -#define MLO_SOFTMAXHOST_H_ #include -#endif diff --git a/projects/miopen/driver/random.hpp b/projects/miopen/driver/random.hpp index 81e630411c67..30be9387d99c 100644 --- a/projects/miopen/driver/random.hpp +++ b/projects/miopen/driver/random.hpp @@ -1,5 +1,2 @@ // Forwarding header — implementation moved to common_utils. -#ifndef GUARD_RANDOM_GEN_ -#define GUARD_RANDOM_GEN_ #include -#endif diff --git a/projects/miopen/src/include/miopen/algorithm.hpp b/projects/miopen/src/include/miopen/algorithm.hpp index 91b0383b823b..38b87c1e38b4 100644 --- a/projects/miopen/src/include/miopen/algorithm.hpp +++ b/projects/miopen/src/include/miopen/algorithm.hpp @@ -1,30 +1,2 @@ -/******************************************************************************* - * - * MIT License - * - * Copyright (c) 2019 Advanced Micro Devices, Inc. - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- * - *******************************************************************************/ -// Forwarding header -- implementation moved to common_utils. -#ifndef GUARD_MLOPEN_ALGORITHM_HPP -#define GUARD_MLOPEN_ALGORITHM_HPP +// Forwarding header — implementation moved to common_utils. #include -#endif diff --git a/projects/miopen/src/include/miopen/bfloat16.hpp b/projects/miopen/src/include/miopen/bfloat16.hpp index fc3880629c68..eab3c5b2c826 100644 --- a/projects/miopen/src/include/miopen/bfloat16.hpp +++ b/projects/miopen/src/include/miopen/bfloat16.hpp @@ -1,5 +1,2 @@ // Forwarding header — implementation moved to common_utils. -#ifndef BFLOAT16_H_ -#define BFLOAT16_H_ #include -#endif diff --git a/projects/miopen/src/include/miopen/each_args.hpp b/projects/miopen/src/include/miopen/each_args.hpp index 646fd53d263f..983c7da843dd 100644 --- a/projects/miopen/src/include/miopen/each_args.hpp +++ b/projects/miopen/src/include/miopen/each_args.hpp @@ -1,30 +1,2 @@ -/******************************************************************************* - * - * MIT License - * - * Copyright (c) 2017 Advanced Micro Devices, Inc. - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - * - *******************************************************************************/ -// Forwarding header -- implementation moved to common_utils. -#ifndef GUARD_MIOPEN_EACH_ARGS_HPP -#define GUARD_MIOPEN_EACH_ARGS_HPP +// Forwarding header — implementation moved to common_utils. #include -#endif diff --git a/projects/miopen/src/include/miopen/float_equal.hpp b/projects/miopen/src/include/miopen/float_equal.hpp index 43bd3d7ab14a..a48c2e417489 100644 --- a/projects/miopen/src/include/miopen/float_equal.hpp +++ b/projects/miopen/src/include/miopen/float_equal.hpp @@ -1,30 +1,2 @@ -/******************************************************************************* - * - * MIT License - * - * Copyright (c) 2017 Advanced Micro Devices, Inc. - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - * - *******************************************************************************/ -// Forwarding header -- implementation moved to common_utils. -#ifndef GUARD_MLOPEN_FLOAT_EQUAL_HPP -#define GUARD_MLOPEN_FLOAT_EQUAL_HPP +// Forwarding header — implementation moved to common_utils. #include -#endif diff --git a/projects/miopen/src/include/miopen/ford.hpp b/projects/miopen/src/include/miopen/ford.hpp index 0dc62c9ae495..beac57e1e6e8 100644 --- a/projects/miopen/src/include/miopen/ford.hpp +++ b/projects/miopen/src/include/miopen/ford.hpp @@ -1,5 +1,2 @@ // Forwarding header — implementation moved to common_utils. -#ifndef GUARD_FORD_HPP -#define GUARD_FORD_HPP #include -#endif diff --git a/projects/miopen/src/include/miopen/functional.hpp b/projects/miopen/src/include/miopen/functional.hpp index d1f7cb973349..d0a70ae6794d 100644 --- a/projects/miopen/src/include/miopen/functional.hpp +++ b/projects/miopen/src/include/miopen/functional.hpp @@ -1,5 +1,2 @@ // Forwarding header — implementation moved to common_utils. -#ifndef GUARD_MLOPEN_FUNCTIONAL_HPP -#define GUARD_MLOPEN_FUNCTIONAL_HPP #include -#endif diff --git a/projects/miopen/src/include/miopen/par_for.hpp b/projects/miopen/src/include/miopen/par_for.hpp index 71a1125de408..4685b005db77 100644 --- a/projects/miopen/src/include/miopen/par_for.hpp +++ b/projects/miopen/src/include/miopen/par_for.hpp @@ -1,30 +1,2 @@ -/******************************************************************************* - * - * MIT License - * - * Copyright (c) 2020 Advanced Micro Devices, Inc. 
- * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - * - *******************************************************************************/ -// Forwarding header -- implementation moved to common_utils. -#ifndef MIOPEN_GUARD_MLOPEN_PAR_FOR_HPP -#define MIOPEN_GUARD_MLOPEN_PAR_FOR_HPP +// Forwarding header — implementation moved to common_utils. #include -#endif diff --git a/projects/miopen/src/include/miopen/rank.hpp b/projects/miopen/src/include/miopen/rank.hpp index 1756782673ad..88a4541421d4 100644 --- a/projects/miopen/src/include/miopen/rank.hpp +++ b/projects/miopen/src/include/miopen/rank.hpp @@ -1,30 +1,2 @@ -/******************************************************************************* - * - * MIT License - * - * Copyright (c) 2017 Advanced Micro Devices, Inc. 
- * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - * - *******************************************************************************/ -// Forwarding header -- implementation moved to common_utils. -#ifndef GUARD_MIOPEN_RANK_HPP -#define GUARD_MIOPEN_RANK_HPP +// Forwarding header — implementation moved to common_utils. #include -#endif diff --git a/projects/miopen/src/include/miopen/reduce_common.hpp b/projects/miopen/src/include/miopen/reduce_common.hpp index f1bd0b38e320..8d47ee0f05b0 100644 --- a/projects/miopen/src/include/miopen/reduce_common.hpp +++ b/projects/miopen/src/include/miopen/reduce_common.hpp @@ -1,5 +1,2 @@ // Forwarding header — implementation moved to common_utils. 
-#ifndef GUARD_MIOPEN_REDUCE_COMMON_HPP -#define GUARD_MIOPEN_REDUCE_COMMON_HPP #include -#endif diff --git a/projects/miopen/src/include/miopen/returns.hpp b/projects/miopen/src/include/miopen/returns.hpp index dd0873cfb2b3..8bd3067fdea3 100644 --- a/projects/miopen/src/include/miopen/returns.hpp +++ b/projects/miopen/src/include/miopen/returns.hpp @@ -1,30 +1,2 @@ -/******************************************************************************* - * - * MIT License - * - * Copyright (c) 2017 Advanced Micro Devices, Inc. - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - * - *******************************************************************************/ -// Forwarding header -- implementation moved to common_utils. -#ifndef GUARD_MIOPEN_RETURNS_HPP -#define GUARD_MIOPEN_RETURNS_HPP +// Forwarding header — implementation moved to common_utils. 
#include -#endif diff --git a/projects/miopen/src/include/miopen/stringutils.hpp b/projects/miopen/src/include/miopen/stringutils.hpp index 38f52efd1cf6..168eb6bee75e 100644 --- a/projects/miopen/src/include/miopen/stringutils.hpp +++ b/projects/miopen/src/include/miopen/stringutils.hpp @@ -1,5 +1,2 @@ // Forwarding header — implementation moved to common_utils. -#ifndef GUARD_MIOPEN_STRINGUTILS_HPP -#define GUARD_MIOPEN_STRINGUTILS_HPP #include -#endif diff --git a/projects/miopen/src/include/miopen/type_name.hpp b/projects/miopen/src/include/miopen/type_name.hpp index d2cce63d3d32..4f4afd78def0 100644 --- a/projects/miopen/src/include/miopen/type_name.hpp +++ b/projects/miopen/src/include/miopen/type_name.hpp @@ -1,30 +1,2 @@ -/******************************************************************************* - * - * MIT License - * - * Copyright (c) 2017-2024 Advanced Micro Devices, Inc. - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- * - *******************************************************************************/ -// Forwarding header -- implementation moved to common_utils. -#ifndef GUARD_TYPE_NAME_HPP -#define GUARD_TYPE_NAME_HPP +// Forwarding header — implementation moved to common_utils. #include -#endif diff --git a/projects/miopen/test/cpu_bias.hpp b/projects/miopen/test/cpu_bias.hpp index 4b150035d5c0..2abbcccde0da 100644 --- a/projects/miopen/test/cpu_bias.hpp +++ b/projects/miopen/test/cpu_bias.hpp @@ -1,5 +1,2 @@ // Forwarding header — implementation moved to miopen_utils. -#ifndef GUARD_CPU_BIAS_HPP -#define GUARD_CPU_BIAS_HPP #include -#endif diff --git a/projects/miopen/test/cpu_conv.hpp b/projects/miopen/test/cpu_conv.hpp index fac5227efe75..818e215c45e2 100644 --- a/projects/miopen/test/cpu_conv.hpp +++ b/projects/miopen/test/cpu_conv.hpp @@ -1,5 +1,2 @@ // Forwarding header — implementation moved to miopen_utils. -#ifndef GUARD_CPU_CONV_HPP -#define GUARD_CPU_CONV_HPP #include -#endif diff --git a/projects/miopen/test/cpu_layernorm.hpp b/projects/miopen/test/cpu_layernorm.hpp index 9f1c7a55ba42..a9f7b139484c 100644 --- a/projects/miopen/test/cpu_layernorm.hpp +++ b/projects/miopen/test/cpu_layernorm.hpp @@ -1,5 +1,2 @@ // Forwarding header — implementation moved to miopen_utils. -#ifndef GUARD_CPU_CONV_HPP -#define GUARD_CPU_CONV_HPP #include -#endif diff --git a/projects/miopen/test/cpu_reduce_util.hpp b/projects/miopen/test/cpu_reduce_util.hpp index 73de3b18e2e1..401dd20b994b 100644 --- a/projects/miopen/test/cpu_reduce_util.hpp +++ b/projects/miopen/test/cpu_reduce_util.hpp @@ -1,5 +1,2 @@ // Forwarding header — implementation moved to miopen_utils. 
-#ifndef GUARD_CPU_REDUCE_UTIL_HPP -#define GUARD_CPU_REDUCE_UTIL_HPP #include -#endif diff --git a/projects/miopen/test/fusionHost.hpp b/projects/miopen/test/fusionHost.hpp index a13ee5601cd4..c95d14da6f82 100644 --- a/projects/miopen/test/fusionHost.hpp +++ b/projects/miopen/test/fusionHost.hpp @@ -1,3 +1,2 @@ // Forwarding header — implementation moved to miopen_utils. -#pragma once #include diff --git a/projects/miopen/test/gemm.hpp b/projects/miopen/test/gemm.hpp index 34fa7db11bec..be0195545352 100644 --- a/projects/miopen/test/gemm.hpp +++ b/projects/miopen/test/gemm.hpp @@ -1,5 +1,2 @@ // Forwarding header — implementation moved to miopen_utils. -#ifndef GUARD_GEMM_HPP -#define GUARD_GEMM_HPP #include -#endif diff --git a/projects/miopen/test/network_data.hpp b/projects/miopen/test/network_data.hpp index 7a0dbcd702dd..18e85973ef3f 100644 --- a/projects/miopen/test/network_data.hpp +++ b/projects/miopen/test/network_data.hpp @@ -1,5 +1,2 @@ // Forwarding header — implementation moved to miopen_utils. -#ifndef GUARD_MIOPEN_TEST_NETWORK_DATA_HPP -#define GUARD_MIOPEN_TEST_NETWORK_DATA_HPP #include -#endif diff --git a/projects/miopen/test/random.hpp b/projects/miopen/test/random.hpp index 7c5c0efa5962..3bb99a37d6c9 100644 --- a/projects/miopen/test/random.hpp +++ b/projects/miopen/test/random.hpp @@ -1,5 +1,2 @@ // Forwarding header — implementation moved to miopen_utils. -#ifndef GUARD_MIOPEN_TEST_RANDOM_HPP -#define GUARD_MIOPEN_TEST_RANDOM_HPP #include -#endif diff --git a/projects/miopen/test/rnn_util.hpp b/projects/miopen/test/rnn_util.hpp index 2a25f35e61a8..0e771bfdfff1 100644 --- a/projects/miopen/test/rnn_util.hpp +++ b/projects/miopen/test/rnn_util.hpp @@ -1,5 +1,2 @@ // Forwarding header — implementation moved to miopen_utils. 
-#ifndef MIOPEN_RNN_UTIL_H_ -#define MIOPEN_RNN_UTIL_H_ #include -#endif diff --git a/projects/miopen/test/serialize.hpp b/projects/miopen/test/serialize.hpp index b9e948307a1e..c3eb459c38df 100644 --- a/projects/miopen/test/serialize.hpp +++ b/projects/miopen/test/serialize.hpp @@ -1,5 +1,2 @@ // Forwarding header — implementation moved to miopen_utils. -#ifndef MIOPEN_GUARD_TEST_SERIALIZE_HPP -#define MIOPEN_GUARD_TEST_SERIALIZE_HPP #include -#endif diff --git a/projects/miopen/test/tensor_holder.hpp b/projects/miopen/test/tensor_holder.hpp index 5f075eb9b528..bc10b5a8b12d 100644 --- a/projects/miopen/test/tensor_holder.hpp +++ b/projects/miopen/test/tensor_holder.hpp @@ -1,5 +1,2 @@ // Forwarding header — implementation moved to miopen_utils. -#ifndef GUARD_TENSOR_HOLDER_HPP -#define GUARD_TENSOR_HOLDER_HPP #include -#endif diff --git a/projects/miopen/test/verify.hpp b/projects/miopen/test/verify.hpp index 2bf12f1057a3..8807b5ecfe2b 100644 --- a/projects/miopen/test/verify.hpp +++ b/projects/miopen/test/verify.hpp @@ -1,5 +1,2 @@ // Forwarding header — implementation moved to miopen_utils. -#ifndef GUARD_VERIFY_HPP -#define GUARD_VERIFY_HPP #include -#endif From 64e5d404aa3c16a957140e4c5be59a990ecaac26 Mon Sep 17 00:00:00 2001 From: Brad Pepers Date: Sat, 9 May 2026 12:10:10 -0600 Subject: [PATCH 07/11] Add missing common_utils/miopen_utils linkage to gtest targets The forwarding headers in src/include/miopen/ include but the gtest build targets were not linking miopen_common_utils, so the include directory was not on the search path. This caused build failures for all gtest targets. 
Co-Authored-By: Claude Sonnet 4 --- projects/miopen/test/gtest/CMakeLists.txt | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/projects/miopen/test/gtest/CMakeLists.txt b/projects/miopen/test/gtest/CMakeLists.txt index af74113fa312..dfdb6ef4630e 100644 --- a/projects/miopen/test/gtest/CMakeLists.txt +++ b/projects/miopen/test/gtest/CMakeLists.txt @@ -81,7 +81,7 @@ function(add_gtest TEST_NAME TEST_CPP) # Workaround : change in rocm-cmake was causing linking error so had to add ${CMAKE_DL_LIBS} # We can remove ${CMAKE_DL_LIBS} once root cause is identified. # MIOpen_with_plugins ensures CK plugin .so's are built alongside the test - target_link_libraries(${TEST_NAME} ${CMAKE_DL_LIBS} GTest::gtest GTest::gtest_main GTest::gmock MIOpen_with_plugins hip::host ) + target_link_libraries(${TEST_NAME} ${CMAKE_DL_LIBS} GTest::gtest GTest::gtest_main GTest::gmock MIOpen_with_plugins hip::host miopen_common_utils miopen_utils) if(NOT MIOPEN_EMBED_DB STREQUAL "") target_link_libraries(${TEST_NAME} $) endif() @@ -211,7 +211,7 @@ endforeach() # Otherwise, all files in ${SOURCES} are rebuilt for each test. add_library(miopen_gtest_common STATIC ${SOURCES}) target_include_directories(miopen_gtest_common PRIVATE ../ ../../src/kernels) -target_link_libraries(miopen_gtest_common PRIVATE GTest::gtest GTest::gmock MIOpen) +target_link_libraries(miopen_gtest_common PRIVATE GTest::gtest GTest::gmock MIOpen miopen_common_utils miopen_utils) if(WIN32) # Refer to https://en.cppreference.com/w/cpp/language/types for details. 
target_compile_options(miopen_gtest_common PRIVATE $:-U__LP64__>>) From 896cbbc95bb4b6e7b9fd405e2e8dd9b72b90ff45 Mon Sep 17 00:00:00 2001 From: Brad Pepers Date: Sat, 9 May 2026 13:04:46 -0600 Subject: [PATCH 08/11] Add missing common_utils linkage to ck_impl and speedtest targets Same fix as the previous gtest commit: the forwarding headers in src/include/miopen/ resolve to common_utils/ headers, so any target that includes MIOpen internals needs miopen_common_utils on its include path. Co-Authored-By: Claude Sonnet 4 --- projects/miopen/speedtests/CMakeLists.txt | 2 +- projects/miopen/src/ck_impl/CMakeLists.txt | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/projects/miopen/speedtests/CMakeLists.txt b/projects/miopen/speedtests/CMakeLists.txt index 9aa89974cc75..826da17b59db 100644 --- a/projects/miopen/speedtests/CMakeLists.txt +++ b/projects/miopen/speedtests/CMakeLists.txt @@ -16,7 +16,7 @@ function(add_speedtest_executable TEST_NAME) endif() separate_arguments(MIOPEN_TEST_FLAGS_ARGS NATIVE_COMMAND ${MIOPEN_TEST_FLAGS}) # MIOpen_with_plugins ensures CK plugin .so's are built alongside the speedtest - target_link_libraries(${TEST_NAME} MIOpen_with_plugins) + target_link_libraries(${TEST_NAME} MIOpen_with_plugins miopen_common_utils miopen_utils) target_include_directories(${TEST_NAME} PRIVATE ../test ../src/kernels) endfunction(add_speedtest_executable) diff --git a/projects/miopen/src/ck_impl/CMakeLists.txt b/projects/miopen/src/ck_impl/CMakeLists.txt index ae380f174007..791250958533 100644 --- a/projects/miopen/src/ck_impl/CMakeLists.txt +++ b/projects/miopen/src/ck_impl/CMakeLists.txt @@ -145,7 +145,7 @@ foreach(gpu_target IN LISTS _CK_FILTERED_TARGETS) target_link_libraries(${lib_name} PRIVATE hip::device) # Link against MIOpen for shared types (ConvSolution, InvokerFactory, etc.) 
- target_link_libraries(${lib_name} PRIVATE MIOpen) + target_link_libraries(${lib_name} PRIVATE MIOpen miopen_common_utils) # Install alongside MIOpen install(TARGETS ${lib_name} From 70381cf345ad81221ffbdf4faa3ce066eb4919c1 Mon Sep 17 00:00:00 2001 From: Brad Pepers Date: Sat, 9 May 2026 16:19:47 -0600 Subject: [PATCH 09/11] Fix unified miopen_gtest build: restore lost transitive includes The forwarding headers and removed driver/ cross-includes broke 8 test files in the unified miopen_gtest binary: - test/fusionHost.hpp: add back get_handle.hpp include that the miopen_utils version correctly omits but test code depends on - reduceextreme.hpp, reducecalculation.hpp: move miopen/miopen.h before kernel headers that static_assert on its macros - layout_transpose.cpp: add float16 typedef lost when the driver/conv_common.hpp cross-include was removed Co-Authored-By: Claude Sonnet 4 --- projects/miopen/test/fusionHost.hpp | 1 + projects/miopen/test/gtest/layout_transpose.cpp | 2 ++ projects/miopen/test/gtest/reducecalculation.hpp | 2 +- projects/miopen/test/gtest/reduceextreme.hpp | 2 +- 4 files changed, 5 insertions(+), 2 deletions(-) diff --git a/projects/miopen/test/fusionHost.hpp b/projects/miopen/test/fusionHost.hpp index c95d14da6f82..11c6d54f6257 100644 --- a/projects/miopen/test/fusionHost.hpp +++ b/projects/miopen/test/fusionHost.hpp @@ -1,2 +1,3 @@ // Forwarding header — implementation moved to miopen_utils. 
#include <miopen_utils/fusionHost.hpp> +#include "get_handle.hpp" diff --git a/projects/miopen/test/gtest/layout_transpose.cpp b/projects/miopen/test/gtest/layout_transpose.cpp index b4c86a99846a..b688d17b2aa7 100644 --- a/projects/miopen/test/gtest/layout_transpose.cpp +++ b/projects/miopen/test/gtest/layout_transpose.cpp @@ -37,6 +37,8 @@ #include +using float16 = half_float::half; + namespace { template diff --git a/projects/miopen/test/gtest/reducecalculation.hpp b/projects/miopen/test/gtest/reducecalculation.hpp index 94b70ac8a1ea..3b2de8465c0c 100644 --- a/projects/miopen/test/gtest/reducecalculation.hpp +++ b/projects/miopen/test/gtest/reducecalculation.hpp @@ -24,13 +24,13 @@ * *******************************************************************************/ +#include <miopen/miopen.h> #include "../src/kernels/MIOpenReduceCalculation.hpp" #include "get_handle.hpp" #include "random.hpp" #include "tensor_holder.hpp" #include "verify.hpp" #include -#include <miopen/miopen.h> #include template diff --git a/projects/miopen/test/gtest/reduceextreme.hpp b/projects/miopen/test/gtest/reduceextreme.hpp index 4d2658a39569..0c2cde8c7564 100644 --- a/projects/miopen/test/gtest/reduceextreme.hpp +++ b/projects/miopen/test/gtest/reduceextreme.hpp @@ -24,6 +24,7 @@ * *******************************************************************************/ +#include <miopen/miopen.h> #include "../src/kernels/MIOpenReduceExtreme.hpp" #include "get_handle.hpp" #include "random.hpp" @@ -31,7 +32,6 @@ #include "verify.hpp" #include #include -#include <miopen/miopen.h> template bool compare_equal(T r1, T r2)
No functional or build behavior changes. Co-Authored-By: Claude Sonnet 4 --- projects/miopen/common_utils/CMakeLists.txt | 9 ++++++--- projects/miopen/miopen_utils/CMakeLists.txt | 8 +++++--- 2 files changed, 11 insertions(+), 6 deletions(-) diff --git a/projects/miopen/common_utils/CMakeLists.txt b/projects/miopen/common_utils/CMakeLists.txt index 1afb185255c9..7bb6572ee7ac 100644 --- a/projects/miopen/common_utils/CMakeLists.txt +++ b/projects/miopen/common_utils/CMakeLists.txt @@ -24,13 +24,16 @@ # ################################################################################ -# Header-only utility library shared by MIOpen, MIOpenDriver, and tests. -# Contains pure C++ utilities with NO MIOpen or GPU dependencies. +# INTERNAL BUILD-ONLY library — not installed, not exported, not part of the public MIOpen API. +# Header-only pure C++ utilities shared by MIOpen, MIOpenDriver, and tests. +# Contains NO MIOpen or GPU dependencies. +# Do NOT add install(TARGETS miopen_common_utils ...) — headers live in the build tree only. add_library(miopen_common_utils INTERFACE) -add_library(MIOpen::common_utils ALIAS miopen_common_utils) +set_target_properties(miopen_common_utils PROPERTIES EXCLUDE_FROM_ALL TRUE) target_include_directories(miopen_common_utils INTERFACE + # BUILD_INTERFACE only — no install interface; these headers are not installed. $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/include> ) diff --git a/projects/miopen/miopen_utils/CMakeLists.txt b/projects/miopen/miopen_utils/CMakeLists.txt index 47e61c063411..e93a717d0a0e 100644 --- a/projects/miopen/miopen_utils/CMakeLists.txt +++ b/projects/miopen/miopen_utils/CMakeLists.txt @@ -24,14 +24,16 @@ # ################################################################################ -# Utility library for MIOpen test/verification code shared by MIOpenDriver and tests. +# INTERNAL BUILD-ONLY library — not installed, not exported, not part of the public MIOpen API. +# Shared verification/test utilities for MIOpenDriver and tests. 
# Depends on common_utils and the MIOpen public API (miopen.h). -# Phase 1: May still use MIOpen internal headers temporarily. +# Do NOT add install(TARGETS miopen_utils ...) — headers live in the build tree only. add_library(miopen_utils INTERFACE) -add_library(MIOpen::miopen_utils ALIAS miopen_utils) +set_target_properties(miopen_utils PROPERTIES EXCLUDE_FROM_ALL TRUE) target_include_directories(miopen_utils INTERFACE + # BUILD_INTERFACE only — no install interface; these headers are not installed. $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/include> ) From 8d14b9691b2820e244f08670f57b123329d285e3 Mon Sep 17 00:00:00 2001 From: Brad Pepers Date: Sun, 10 May 2026 14:44:09 -0600 Subject: [PATCH 11/11] Move MIOPEN_USE_RNE_BFLOAT16 option to top-level CMakeLists.txt The option was declared in two places (common_utils/ and src/), with the src/ declaration being a silent no-op since common_utils/ runs first. Move the single option() declaration to the top-level CMakeLists.txt, which is the canonical location for all project-wide MIOpen build options. Both common_utils/ and src/ now consume it from the CMake cache without re-declaring it. No functional or build behavior changes. Co-Authored-By: Claude Sonnet 4 --- projects/miopen/CMakeLists.txt | 7 +++++++ projects/miopen/common_utils/CMakeLists.txt | 5 ++--- projects/miopen/src/CMakeLists.txt | 9 +-------- 3 files changed, 10 insertions(+), 11 deletions(-) diff --git a/projects/miopen/CMakeLists.txt b/projects/miopen/CMakeLists.txt index 26bf20fd0690..57089253f3e1 100644 --- a/projects/miopen/CMakeLists.txt +++ b/projects/miopen/CMakeLists.txt @@ -110,6 +110,13 @@ if(MIOPEN_INCBIN) enable_language(ASM) endif() +# Truncation rounding or (default) rounding to nearest even (RNE) is enabled. +# This switch controls two related but different aspects of MIOpen behavior: +# 1. How host code performs conversions of float to bfloat16 (important for testing). +# 2. 
How BF16 kernels perform the final conversion (and rounding) of FP32 to BF16 results +# (affects the main functionality of the library). +option(MIOPEN_USE_RNE_BFLOAT16 "Sets rounding scheme for bfloat16 type" ON) + # Strip symbols for release if(MIOPEN_STRIP_SYMBOLS AND NOT WIN32 AND NOT APPLE) set(CMAKE_C_FLAGS_RELEASE "${CMAKE_C_FLAGS_RELEASE} -s") diff --git a/projects/miopen/common_utils/CMakeLists.txt b/projects/miopen/common_utils/CMakeLists.txt index 7bb6572ee7ac..d538ef6ef258 100644 --- a/projects/miopen/common_utils/CMakeLists.txt +++ b/projects/miopen/common_utils/CMakeLists.txt @@ -37,9 +37,8 @@ target_include_directories(miopen_common_utils INTERFACE $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/include> ) -# bfloat16.hpp needs to know the rounding mode. -# This option is also defined in src/CMakeLists.txt for backward compatibility. -option(MIOPEN_USE_RNE_BFLOAT16 "Sets rounding scheme for bfloat16 type" ON) +# bfloat16.hpp needs MIOPEN_USE_RNE_BFLOAT16 at compile time. +# The option is declared in the top-level CMakeLists.txt. if(MIOPEN_USE_RNE_BFLOAT16) target_compile_definitions(miopen_common_utils INTERFACE MIOPEN_USE_RNE_BFLOAT16=1) else() diff --git a/projects/miopen/src/CMakeLists.txt b/projects/miopen/src/CMakeLists.txt index 3ba48b6ca763..84bbd53716fb 100644 --- a/projects/miopen/src/CMakeLists.txt +++ b/projects/miopen/src/CMakeLists.txt @@ -8,14 +8,7 @@ if(MIOPEN_ENABLE_SQLITE) add_subdirectory(sqlite) endif() -# Truncation rounding or (default) rounding to nearest even (RNE) is enabled. -# This switch controls two related but different aspects of MIOpen behavior -# 1. How host code performs conversions of float to bfloat16, important only -# for testing. -# 2. How BF16 kernels (which are kind of mixed-precision now and expected to -# remain in the future) perform final conversion (and rounding) of FP32 -# to BF16 results. This affects the main functionality of the library. 
-option( MIOPEN_USE_RNE_BFLOAT16 "Sets rounding scheme for bfloat16 type" ON ) +# MIOPEN_USE_RNE_BFLOAT16 is declared in the top-level CMakeLists.txt. option( MIOPEN_FP8_IEEE_EXPONENT_BIAS "Sets the FP8 exponent bias to IEEE" OFF) option( MIOPEN_FP8_CLIPPING "Sets the FP8 clipping" ON)