From 2f60cb0dbb62c74bab0d9c4756b1b0e92da4bcdd Mon Sep 17 00:00:00 2001 From: Brad Pepers Date: Sat, 9 May 2026 07:08:04 -0600 Subject: [PATCH 01/11] Create common_utils shared utility library and move pure utilities Introduce a header-only common_utils library for pure C++ utilities shared by MIOpen library, MIOpenDriver, and tests. This is the first step toward a layered architecture that eliminates circular dependencies between driver, test, and library code. Move 9 utility headers from src/include/miopen/ to common_utils/include/common_utils/: - rank.hpp, returns.hpp, algorithm.hpp (zero-dependency) - float_equal.hpp, each_args.hpp, type_name.hpp, par_for.hpp - functional.hpp, ford.hpp (depend on other moved utilities) Original locations retain thin forwarding headers for backward compatibility. All internal cross-references within moved headers updated to use common_utils/ paths. CMake: common_utils added as INTERFACE library, linked by MIOpen, MIOpenDriver, and test targets. Co-Authored-By: Claude Sonnet 4 --- projects/miopen/CMakeLists.txt | 1 + projects/miopen/common_utils/CMakeLists.txt | 35 ++++ .../include/common_utils/algorithm.hpp | 47 ++++++ .../include/common_utils/each_args.hpp | 79 ++++++++++ .../include/common_utils/float_equal.hpp | 89 +++++++++++ .../include/common_utils/ford.hpp | 122 ++++++++++++++ .../include/common_utils/functional.hpp | 131 +++++++++++++++ .../include/common_utils/par_for.hpp | 149 ++++++++++++++++++ .../include/common_utils/rank.hpp | 42 +++++ .../include/common_utils/returns.hpp | 38 +++++ .../include/common_utils/type_name.hpp | 139 ++++++++++++++++ projects/miopen/driver/CMakeLists.txt | 2 +- projects/miopen/src/CMakeLists.txt | 2 +- .../miopen/src/include/miopen/algorithm.hpp | 21 +-- .../miopen/src/include/miopen/each_args.hpp | 53 +------ .../miopen/src/include/miopen/float_equal.hpp | 63 +------- projects/miopen/src/include/miopen/ford.hpp | 121 +------------- .../miopen/src/include/miopen/functional.hpp | 130 
+-------------- .../miopen/src/include/miopen/par_for.hpp | 123 +-------------- projects/miopen/src/include/miopen/rank.hpp | 16 +- .../miopen/src/include/miopen/returns.hpp | 12 +- .../miopen/src/include/miopen/type_name.hpp | 113 +------------ projects/miopen/test/CMakeLists.txt | 4 +- 23 files changed, 894 insertions(+), 638 deletions(-) create mode 100644 projects/miopen/common_utils/CMakeLists.txt create mode 100644 projects/miopen/common_utils/include/common_utils/algorithm.hpp create mode 100644 projects/miopen/common_utils/include/common_utils/each_args.hpp create mode 100644 projects/miopen/common_utils/include/common_utils/float_equal.hpp create mode 100644 projects/miopen/common_utils/include/common_utils/ford.hpp create mode 100644 projects/miopen/common_utils/include/common_utils/functional.hpp create mode 100644 projects/miopen/common_utils/include/common_utils/par_for.hpp create mode 100644 projects/miopen/common_utils/include/common_utils/rank.hpp create mode 100644 projects/miopen/common_utils/include/common_utils/returns.hpp create mode 100644 projects/miopen/common_utils/include/common_utils/type_name.hpp diff --git a/projects/miopen/CMakeLists.txt b/projects/miopen/CMakeLists.txt index af87cd1c7e16..627ddec85bbd 100644 --- a/projects/miopen/CMakeLists.txt +++ b/projects/miopen/CMakeLists.txt @@ -894,6 +894,7 @@ set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/bin) if(NOT MIOPEN_USE_SQLITE_PERFDB) add_subdirectory(tools/sqlite2txt) endif() +add_subdirectory(common_utils) add_subdirectory(addkernels) add_subdirectory(src) if(MIOPEN_BUILD_DRIVER) diff --git a/projects/miopen/common_utils/CMakeLists.txt b/projects/miopen/common_utils/CMakeLists.txt new file mode 100644 index 000000000000..c0f4620a3439 --- /dev/null +++ b/projects/miopen/common_utils/CMakeLists.txt @@ -0,0 +1,35 @@ +################################################################################ +# +# MIT License +# +# Copyright (c) 2025 Advanced Micro Devices, Inc. 
+# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. +# +################################################################################ + +# Header-only utility library shared by MIOpen, MIOpenDriver, and tests. +# Contains pure C++ utilities with NO MIOpen or GPU dependencies. + +add_library(miopen_common_utils INTERFACE) +add_library(MIOpen::common_utils ALIAS miopen_common_utils) + +target_include_directories(miopen_common_utils INTERFACE + $ +) diff --git a/projects/miopen/common_utils/include/common_utils/algorithm.hpp b/projects/miopen/common_utils/include/common_utils/algorithm.hpp new file mode 100644 index 000000000000..d1098a066077 --- /dev/null +++ b/projects/miopen/common_utils/include/common_utils/algorithm.hpp @@ -0,0 +1,47 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2019 Advanced Micro Devices, Inc. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + *******************************************************************************/ +#ifndef GUARD_MLOPEN_ALGORITHM_HPP +#define GUARD_MLOPEN_ALGORITHM_HPP + +#include + +namespace miopen { + +template +bool any_of(const Range& r, Predicate p) +{ + return std::any_of(r.begin(), r.end(), p); +} + +template +bool all_of(const Range& r, Predicate p) +{ + return std::all_of(r.begin(), r.end(), p); +} + +} // namespace miopen + +#endif diff --git a/projects/miopen/common_utils/include/common_utils/each_args.hpp b/projects/miopen/common_utils/include/common_utils/each_args.hpp new file mode 100644 index 000000000000..e078153dc998 --- /dev/null +++ b/projects/miopen/common_utils/include/common_utils/each_args.hpp @@ -0,0 +1,79 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2017 Advanced Micro Devices, Inc. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + *******************************************************************************/ +#ifndef GUARD_MIOPEN_EACH_ARGS_HPP +#define GUARD_MIOPEN_EACH_ARGS_HPP + +#include +#include +#include + +namespace miopen { +namespace detail { + +template +void each_args_i_impl(F f, std::index_sequence, Ts&&... xs) +{ + (void)std::initializer_list{ + (f(std::integral_constant{}, std::forward(xs)), 0)...}; +} + +template +auto unpack_impl(F f, std::index_sequence, T&& x) +{ + return f(std::get(x)...); +} + +} // namespace detail + +template +void each_args_i(F f, Ts&&... xs) +{ + detail::each_args_i_impl(f, std::make_index_sequence(), std::forward(xs)...); +} + +template +void each_args(F f, Ts&&... 
xs) +{ + (void)std::initializer_list{(f(std::forward(xs)), 0)...}; +} + +// Workaround for gcc warnings +template +void each_args(F) +{ +} + +template +auto unpack(F f, T&& x) +{ + using type = typename std::remove_cv::type>::type; + return detail::unpack_impl( + f, std::make_index_sequence::value>(), std::forward(x)); +} + +} // namespace miopen + +#endif diff --git a/projects/miopen/common_utils/include/common_utils/float_equal.hpp b/projects/miopen/common_utils/include/common_utils/float_equal.hpp new file mode 100644 index 000000000000..24bbdc55ad11 --- /dev/null +++ b/projects/miopen/common_utils/include/common_utils/float_equal.hpp @@ -0,0 +1,89 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2017 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + *******************************************************************************/ +#ifndef GUARD_MLOPEN_FLOAT_EQUAL_HPP +#define GUARD_MLOPEN_FLOAT_EQUAL_HPP + +#include +#include +#include +#include + +namespace miopen { + +template +using common_type = typename std::common_type::type; + +struct float_equal_fn +{ + template + static bool apply(T x, T y) + { + // The standard library from MSVC does not implement std::isfinite() for integer + // types - no additional overloads are provided. According to the documentation, + // integer types should be treaded as doubles. + // Refer to https://en.cppreference.com/w/cpp/numeric/math/isfinite for more information. + return std::isfinite(static_cast(x)) and std::isfinite(static_cast(y)) and + std::nextafter(x, std::numeric_limits::lowest()) <= y and + std::nextafter(x, std::numeric_limits::max()) >= y; + } + + template + bool operator()(T x, U y) const + { + return float_equal_fn::apply>(x, y); + } +}; + +static constexpr float_equal_fn float_equal{}; + +/// Special case for comparing with a sentinel value +struct float_equal_sentinel_fn +{ + template + static bool apply(T x, T y) + { +// In this case we have to ignore this warning, because we intend to compare with the exact value +#pragma clang diagnostic push +#pragma clang diagnostic ignored "-Wfloat-equal" + bool equals_sentinel = x == y; +#pragma clang diagnostic pop + + return std::isfinite(static_cast(x)) and std::isfinite(static_cast(y)) and + equals_sentinel; + } + + template + bool operator()(T x, U y) const + { + return float_equal_sentinel_fn::apply>(x, y); + } +}; + +static constexpr float_equal_sentinel_fn float_equal_sentinel{}; + +} // namespace miopen + +#endif diff --git a/projects/miopen/common_utils/include/common_utils/ford.hpp b/projects/miopen/common_utils/include/common_utils/ford.hpp new file mode 100644 index 000000000000..4ff4ddfa32e2 --- /dev/null +++ b/projects/miopen/common_utils/include/common_utils/ford.hpp @@ -0,0 +1,122 @@ 
+/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2017 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + *******************************************************************************/ +#ifndef GUARD_FORD_HPP +#define GUARD_FORD_HPP + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include + +namespace miopen { + +// An improved async, that doesn't block +template +std::future::type> detach_async(Function&& f) +{ + using result_type = typename std::invoke_result::type; + std::packaged_task task(std::forward(f)); + auto fut = task.get_future(); + std::thread(std::move(task)).detach(); + return fut; +} + +template +auto then(std::future f, Work w) -> std::future +{ + return std::async(std::launch::deferred, + [=, f_ = std::move(f)]() mutable { return w(f_.get()); }); +} + +template +struct ford_wrapper +{ + template + auto operator()(Ts... xs) const MIOPEN_RETURNS(std::bind(T{}, std::placeholders::_1, xs...)); +}; + +// Multidimensional for loop +struct ford_impl +{ + template + void operator()(F f) const + { + f(); + } + + template + void operator()(F f, T x, Ts... xs) const + { + // Workaround for https://gcc.gnu.org/bugzilla/show_bug.cgi?id=55914 + for(T i = 0; i < x; i++) + { + (*this)([&](Ts... is) { f(i, is...); }, xs...); + } + } +}; + +static constexpr ford_wrapper ford{}; + +struct par_ford_impl +{ + template + void operator()(F f, Ts... 
xs) const + { + using array_type = std::array; + array_type lens = {{static_cast(xs)...}}; + array_type strides; + strides.fill(1); + std::partial_sum( + lens.rbegin(), lens.rend() - 1, strides.rbegin() + 1, std::multiplies()); + auto size = std::accumulate( + lens.begin(), lens.end(), static_cast(1), std::multiplies()); + par_for(size, [&](std::size_t i) { + array_type indices; + std::transform(strides.begin(), + strides.end(), + lens.begin(), + indices.begin(), + [&](size_t stride, size_t len) { return (i / stride) % len; }); + unpack(f, indices); + }); + } +}; + +static constexpr ford_wrapper par_ford{}; + +} // namespace miopen + +#endif diff --git a/projects/miopen/common_utils/include/common_utils/functional.hpp b/projects/miopen/common_utils/include/common_utils/functional.hpp new file mode 100644 index 000000000000..19dde2bd28dc --- /dev/null +++ b/projects/miopen/common_utils/include/common_utils/functional.hpp @@ -0,0 +1,131 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2017 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + *******************************************************************************/ +#ifndef GUARD_MLOPEN_FUNCTIONAL_HPP +#define GUARD_MLOPEN_FUNCTIONAL_HPP + +#include +#include +#include + +namespace miopen { +namespace detail { + +template +auto each_i_impl(F f, std::index_sequence) + MIOPEN_RETURNS(f(std::integral_constant{}...)); +} // namespace detail + +template +struct by_t +{ + F f; + P p; + template + auto operator()(Ts&&... xs) const MIOPEN_RETURNS(f(p(std::forward(xs))...)) +}; + +template +by_t by(F f, P p) +{ + return {std::move(f), std::move(p)}; +} + +template +struct compose_t +{ + F f; + G g; + template + auto operator()(Ts&&... xs) const MIOPEN_RETURNS(f(g(std::forward(xs)...))) +}; + +template +compose_t compose(F f, G g) +{ + return {std::move(f), std::move(g)}; +} + +template +struct flip_t +{ + F f; + template + auto operator()(T&& x, U&& y) const MIOPEN_RETURNS(f(std::forward(y), std::forward(x))) +}; + +template +flip_t flip(F f) +{ + return {std::move(f)}; +} + +template +struct sequence_t +{ + F f; + template + auto operator()(IntegralConstant) const + MIOPEN_RETURNS(detail::each_i_impl(f, std::make_index_sequence())); +}; + +template +sequence_t sequence(F f) +{ + return {std::move(f)}; +} + +template +void repeat_n(F f, std::integral_constant) +{ + auto fs = [&f](auto... is) { return each_args(f, is...); }; + sequence(fs)(std::integral_constant{}); +} + +template +struct cast_to +{ + template + T operator()(X&& x) const + { + return static_cast(std::forward(x)); + } +}; + +template +auto unpacker(F f) +{ + return [=](auto xs) { return miopen::unpack(f, xs); }; +}; + +template +auto prepender(F f, Xs... xs) +{ + return [=](auto... 
ys) { return f(xs..., ys...); }; +} + +} // namespace miopen + +#endif diff --git a/projects/miopen/common_utils/include/common_utils/par_for.hpp b/projects/miopen/common_utils/include/common_utils/par_for.hpp new file mode 100644 index 000000000000..1272dcf6ac9b --- /dev/null +++ b/projects/miopen/common_utils/include/common_utils/par_for.hpp @@ -0,0 +1,149 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2020 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + *******************************************************************************/ + +#ifndef MIOPEN_GUARD_MLOPEN_PAR_FOR_HPP +#define MIOPEN_GUARD_MLOPEN_PAR_FOR_HPP + +#include +#include +#include +#include +#include +#include + +#include + +namespace miopen { + +struct joinable_thread : std::thread +{ + template + joinable_thread(Xs&&... 
xs) : std::thread(std::forward(xs)...) // NOLINT + { + } + + joinable_thread& operator=(joinable_thread&& other) = default; + joinable_thread(joinable_thread&& other) = default; + + ~joinable_thread() + { + if(this->joinable()) + this->join(); + } +}; + +struct thread_factory +{ + template + joinable_thread operator()(std::size_t& work, std::size_t n, std::size_t grainsize, F f) const + { + auto result = joinable_thread([=] { + std::size_t start = work; + std::size_t last = std::min(n, work + grainsize); + for(std::size_t i = start; i < last; i++) + { + f(i); + } + }); + work += grainsize; + return result; + } +}; + +template +void par_for_impl(std::size_t n, std::size_t threadsize, F f) +{ + if(threadsize <= 1) + { + for(std::size_t i = 0; i < n; i++) + f(i); + } + else + { + std::vector threads(threadsize); + const std::size_t grainsize = std::ceil(static_cast(n) / threads.size()); + + std::size_t work = 0; + std::generate(threads.begin(), + threads.end(), + std::bind(thread_factory{}, std::ref(work), n, grainsize, f)); + assert(work >= n); + } +} + +template +void par_for(std::size_t n, std::size_t min_grain, F f) +{ + const auto threadsize = + std::min(std::thread::hardware_concurrency(), n / min_grain); + par_for_impl(n, threadsize, f); +} + +struct min_grain +{ + std::size_t n = 0; +}; + +template +void par_for(std::size_t n, min_grain mg, F f) +{ + const auto threadsize = std::min(std::thread::hardware_concurrency(), n / mg.n); + par_for_impl(n, threadsize, f); +} + +template +void par_for(std::size_t n, F f) +{ + par_for(n, min_grain{8}, f); +} + +struct max_threads +{ + std::size_t n = 0; +}; + +template +void par_for(std::size_t n, max_threads mt, F f) +{ + const auto threadsize = std::min(std::thread::hardware_concurrency(), mt.n); + par_for_impl(n, std::min(threadsize, n), f); +} + +template +void par_for_strided(std::size_t n, max_threads mt, F f) +{ + auto threadsize = std::min(std::thread::hardware_concurrency(), mt.n); + par_for_impl(threadsize, 
threadsize, [&](auto start) { + for(std::size_t i = start; i < n; i += threadsize) + { + f(i); + } + }); +} + +} // namespace miopen + +#endif diff --git a/projects/miopen/common_utils/include/common_utils/rank.hpp b/projects/miopen/common_utils/include/common_utils/rank.hpp new file mode 100644 index 000000000000..013ec6e7f7f4 --- /dev/null +++ b/projects/miopen/common_utils/include/common_utils/rank.hpp @@ -0,0 +1,42 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2017 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + *******************************************************************************/ +#ifndef GUARD_MIOPEN_RANK_HPP +#define GUARD_MIOPEN_RANK_HPP + +namespace miopen { + +template +struct rank : rank +{ +}; + +template <> +struct rank<0> +{ +}; +} // namespace miopen + +#endif diff --git a/projects/miopen/common_utils/include/common_utils/returns.hpp b/projects/miopen/common_utils/include/common_utils/returns.hpp new file mode 100644 index 000000000000..4fdb1db18b87 --- /dev/null +++ b/projects/miopen/common_utils/include/common_utils/returns.hpp @@ -0,0 +1,38 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2017 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + *******************************************************************************/ + +#ifndef GUARD_MIOPEN_RETURNS_HPP +#define GUARD_MIOPEN_RETURNS_HPP + +#define MIOPEN_RETURNS(...) \ + ->decltype(__VA_ARGS__) { return __VA_ARGS__; } + +#define MIOPEN_BODY_RETURNS(...) \ + { \ + return __VA_ARGS__; \ + } + +#endif diff --git a/projects/miopen/common_utils/include/common_utils/type_name.hpp b/projects/miopen/common_utils/include/common_utils/type_name.hpp new file mode 100644 index 000000000000..ac7fd2ff6017 --- /dev/null +++ b/projects/miopen/common_utils/include/common_utils/type_name.hpp @@ -0,0 +1,139 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2017-2024 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + *******************************************************************************/ + +#ifndef GUARD_TYPE_NAME_HPP +#define GUARD_TYPE_NAME_HPP + +#include +#include +#if defined(_MSC_VER) && !defined(__clang__) && !defined(__GNUC__) +#include +#endif + +namespace miopen { + +template +constexpr std::string_view type_name() +{ +#if defined(__clang__) || defined(__GNUC__) + // clang or gcc + constexpr auto full_name = std::string_view{__PRETTY_FUNCTION__}; +#elif defined(_MSC_VER) + // msvc + constexpr auto full_name = std::string_view{__FUNCSIG__}; +#endif + + // The substring with the data type name is located within the original string, between the + // prefix and the suffix, with the prefix always not at the beginning of the string and the + // suffix always at the end of the string. +#if defined(__clang__) + // clang + constexpr auto prefix = std::string_view{"[T = "}; + constexpr auto suffix = std::string_view{"]"}; +#elif defined(__GNUC__) + // gcc + constexpr auto prefix = std::string_view{"[with T = "}; + constexpr auto suffix = std::string_view{"; std::string_view = std::basic_string_view]"}; +#elif defined(_MSC_VER) + // msvc + constexpr auto prefix = std::string_view{"type_name<"}; + constexpr auto suffix = std::string_view{">(void)"}; +#endif + + constexpr auto prefix_pos = full_name.find(prefix); + static_assert(prefix_pos != std::string_view::npos); + + constexpr auto suffix_pos = full_name.rfind(suffix); + static_assert(suffix_pos != std::string_view::npos); + static_assert(suffix_pos == full_name.size() - suffix.size()); + + constexpr auto pos = prefix_pos + prefix.size(); + static_assert(pos < suffix_pos); + constexpr auto count = suffix_pos - pos; + + constexpr auto name = full_name.substr(pos, count); + +#if defined(__clang__) || defined(__GNUC__) + // clang or gcc + return name; +#elif defined(_MSC_VER) + // msvc + if constexpr(std::is_compound_v) + { + // For compound data types, the string contains the keyword 'class/struct/union/enum' 
before + // the data type name, separated by a space. + constexpr auto sep = std::string_view{" "}; + constexpr auto sep_pos = name.find(sep); + static_assert(sep_pos != std::string_view::npos); + static_assert(sep_pos != 0); // must not be at the 0 position + + constexpr auto name_pos = sep_pos + sep.size(); + constexpr auto tname = name.substr(name_pos); + static_assert(tname.size() > 0); + + return tname; + } + else + { + return name; + } +#endif +} + +template +constexpr std::string_view type_name_bare() +{ + constexpr auto name = type_name(); + constexpr auto pos = name.rfind(':'); + if constexpr(pos == std::string_view::npos) + { + constexpr auto result = name; + return result; + } + else + { + constexpr auto bare_name = name.substr(pos + 1); + static_assert(bare_name.size() > 0); + return bare_name; + } +} + +template +const std::string& get_type_name() +{ + static const auto ret = std::string(type_name()); + return ret; +} + +template +const std::string& get_type_name(const T&) +{ + return miopen::get_type_name(); +} + +} // namespace miopen + +#endif diff --git a/projects/miopen/driver/CMakeLists.txt b/projects/miopen/driver/CMakeLists.txt index 4aac2358c432..693a3d47d599 100644 --- a/projects/miopen/driver/CMakeLists.txt +++ b/projects/miopen/driver/CMakeLists.txt @@ -74,7 +74,7 @@ endif() add_dependencies(MIOpenDriver generate_kernels) target_include_directories(MIOpenDriver PRIVATE ../src/kernels) # MIOpen_with_plugins ensures CK plugin .so's are built alongside MIOpenDriver -target_link_libraries(MIOpenDriver PRIVATE MIOpen_with_plugins Threads::Threads roc::rocrand nlohmann_json::nlohmann_json ) +target_link_libraries(MIOpenDriver PRIVATE MIOpen_with_plugins Threads::Threads roc::rocrand nlohmann_json::nlohmann_json miopen_common_utils) if(NOT MIOPEN_EMBED_DB STREQUAL "") target_link_libraries(MIOpenDriver PRIVATE $ ) endif() diff --git a/projects/miopen/src/CMakeLists.txt b/projects/miopen/src/CMakeLists.txt index 9e6f401b7506..3ba48b6ca763 100644 
--- a/projects/miopen/src/CMakeLists.txt +++ b/projects/miopen/src/CMakeLists.txt @@ -931,7 +931,7 @@ endif() target_include_directories(MIOpen SYSTEM PUBLIC $) # Workaround : change in rocm-cmake was causing linking error so had to add ${CMAKE_DL_LIBS} # We can remove ${CMAKE_DL_LIBS} once root cause is identified. -target_link_libraries(MIOpen PRIVATE ${CMAKE_DL_LIBS} Threads::Threads BZip2::BZip2) +target_link_libraries(MIOpen PRIVATE ${CMAKE_DL_LIBS} Threads::Threads BZip2::BZip2 miopen_common_utils) miopen_generate_export_header(MIOpen) if(WIN32) diff --git a/projects/miopen/src/include/miopen/algorithm.hpp b/projects/miopen/src/include/miopen/algorithm.hpp index d1098a066077..91b0383b823b 100644 --- a/projects/miopen/src/include/miopen/algorithm.hpp +++ b/projects/miopen/src/include/miopen/algorithm.hpp @@ -23,25 +23,8 @@ * SOFTWARE. * *******************************************************************************/ +// Forwarding header -- implementation moved to common_utils. #ifndef GUARD_MLOPEN_ALGORITHM_HPP #define GUARD_MLOPEN_ALGORITHM_HPP - -#include - -namespace miopen { - -template -bool any_of(const Range& r, Predicate p) -{ - return std::any_of(r.begin(), r.end(), p); -} - -template -bool all_of(const Range& r, Predicate p) -{ - return std::all_of(r.begin(), r.end(), p); -} - -} // namespace miopen - +#include #endif diff --git a/projects/miopen/src/include/miopen/each_args.hpp b/projects/miopen/src/include/miopen/each_args.hpp index e078153dc998..646fd53d263f 100644 --- a/projects/miopen/src/include/miopen/each_args.hpp +++ b/projects/miopen/src/include/miopen/each_args.hpp @@ -23,57 +23,8 @@ * SOFTWARE. * *******************************************************************************/ +// Forwarding header -- implementation moved to common_utils. 
#ifndef GUARD_MIOPEN_EACH_ARGS_HPP #define GUARD_MIOPEN_EACH_ARGS_HPP - -#include -#include -#include - -namespace miopen { -namespace detail { - -template -void each_args_i_impl(F f, std::index_sequence, Ts&&... xs) -{ - (void)std::initializer_list{ - (f(std::integral_constant{}, std::forward(xs)), 0)...}; -} - -template -auto unpack_impl(F f, std::index_sequence, T&& x) -{ - return f(std::get(x)...); -} - -} // namespace detail - -template -void each_args_i(F f, Ts&&... xs) -{ - detail::each_args_i_impl(f, std::make_index_sequence(), std::forward(xs)...); -} - -template -void each_args(F f, Ts&&... xs) -{ - (void)std::initializer_list{(f(std::forward(xs)), 0)...}; -} - -// Workaround for gcc warnings -template -void each_args(F) -{ -} - -template -auto unpack(F f, T&& x) -{ - using type = typename std::remove_cv::type>::type; - return detail::unpack_impl( - f, std::make_index_sequence::value>(), std::forward(x)); -} - -} // namespace miopen - +#include #endif diff --git a/projects/miopen/src/include/miopen/float_equal.hpp b/projects/miopen/src/include/miopen/float_equal.hpp index 24bbdc55ad11..43bd3d7ab14a 100644 --- a/projects/miopen/src/include/miopen/float_equal.hpp +++ b/projects/miopen/src/include/miopen/float_equal.hpp @@ -23,67 +23,8 @@ * SOFTWARE. * *******************************************************************************/ +// Forwarding header -- implementation moved to common_utils. #ifndef GUARD_MLOPEN_FLOAT_EQUAL_HPP #define GUARD_MLOPEN_FLOAT_EQUAL_HPP - -#include -#include -#include -#include - -namespace miopen { - -template -using common_type = typename std::common_type::type; - -struct float_equal_fn -{ - template - static bool apply(T x, T y) - { - // The standard library from MSVC does not implement std::isfinite() for integer - // types - no additional overloads are provided. According to the documentation, - // integer types should be treaded as doubles. 
- // Refer to https://en.cppreference.com/w/cpp/numeric/math/isfinite for more information. - return std::isfinite(static_cast(x)) and std::isfinite(static_cast(y)) and - std::nextafter(x, std::numeric_limits::lowest()) <= y and - std::nextafter(x, std::numeric_limits::max()) >= y; - } - - template - bool operator()(T x, U y) const - { - return float_equal_fn::apply>(x, y); - } -}; - -static constexpr float_equal_fn float_equal{}; - -/// Special case for comparing with a sentinel value -struct float_equal_sentinel_fn -{ - template - static bool apply(T x, T y) - { -// In this case we have to ignore this warning, because we intend to compare with the exact value -#pragma clang diagnostic push -#pragma clang diagnostic ignored "-Wfloat-equal" - bool equals_sentinel = x == y; -#pragma clang diagnostic pop - - return std::isfinite(static_cast(x)) and std::isfinite(static_cast(y)) and - equals_sentinel; - } - - template - bool operator()(T x, U y) const - { - return float_equal_sentinel_fn::apply>(x, y); - } -}; - -static constexpr float_equal_sentinel_fn float_equal_sentinel{}; - -} // namespace miopen - +#include #endif diff --git a/projects/miopen/src/include/miopen/ford.hpp b/projects/miopen/src/include/miopen/ford.hpp index f56b20de4d46..0dc62c9ae495 100644 --- a/projects/miopen/src/include/miopen/ford.hpp +++ b/projects/miopen/src/include/miopen/ford.hpp @@ -1,122 +1,5 @@ -/******************************************************************************* - * - * MIT License - * - * Copyright (c) 2017 Advanced Micro Devices, Inc. 
- * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - * - *******************************************************************************/ +// Forwarding header -- implementation moved to common_utils. #ifndef GUARD_FORD_HPP #define GUARD_FORD_HPP - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include - -#include - -namespace miopen { - -// An improved async, that doesn't block -template -std::future::type> detach_async(Function&& f) -{ - using result_type = typename std::invoke_result::type; - std::packaged_task task(std::forward(f)); - auto fut = task.get_future(); - std::thread(std::move(task)).detach(); - return fut; -} - -template -auto then(std::future f, Work w) -> std::future -{ - return std::async(std::launch::deferred, - [=, f_ = std::move(f)]() mutable { return w(f_.get()); }); -} - -template -struct ford_wrapper -{ - template - auto operator()(Ts...
xs) const MIOPEN_RETURNS(std::bind(T{}, std::placeholders::_1, xs...)); -}; - -// Multidimensional for loop -struct ford_impl -{ - template - void operator()(F f) const - { - f(); - } - - template - void operator()(F f, T x, Ts... xs) const - { - // Workaround for https://gcc.gnu.org/bugzilla/show_bug.cgi?id=55914 - for(T i = 0; i < x; i++) - { - (*this)([&](Ts... is) { f(i, is...); }, xs...); - } - } -}; - -static constexpr ford_wrapper ford{}; - -struct par_ford_impl -{ - template - void operator()(F f, Ts... xs) const - { - using array_type = std::array; - array_type lens = {{static_cast(xs)...}}; - array_type strides; - strides.fill(1); - std::partial_sum( - lens.rbegin(), lens.rend() - 1, strides.rbegin() + 1, std::multiplies()); - auto size = std::accumulate( - lens.begin(), lens.end(), static_cast(1), std::multiplies()); - par_for(size, [&](std::size_t i) { - array_type indices; - std::transform(strides.begin(), - strides.end(), - lens.begin(), - indices.begin(), - [&](size_t stride, size_t len) { return (i / stride) % len; }); - unpack(f, indices); - }); - } -}; - -static constexpr ford_wrapper par_ford{}; - -} // namespace miopen - +#include #endif diff --git a/projects/miopen/src/include/miopen/functional.hpp b/projects/miopen/src/include/miopen/functional.hpp index 02c6e3427e87..d1f7cb973349 100644 --- a/projects/miopen/src/include/miopen/functional.hpp +++ b/projects/miopen/src/include/miopen/functional.hpp @@ -1,131 +1,5 @@ -/******************************************************************************* - * - * MIT License - * - * Copyright (c) 2017 Advanced Micro Devices, Inc. 
- * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - * - *******************************************************************************/ +// Forwarding header -- implementation moved to common_utils. #ifndef GUARD_MLOPEN_FUNCTIONAL_HPP #define GUARD_MLOPEN_FUNCTIONAL_HPP - -#include -#include -#include - -namespace miopen { -namespace detail { - -template -auto each_i_impl(F f, std::index_sequence) - MIOPEN_RETURNS(f(std::integral_constant{}...)); -} // namespace detail - -template -struct by_t -{ - F f; - P p; - template - auto operator()(Ts&&... xs) const MIOPEN_RETURNS(f(p(std::forward(xs))...)) -}; - -template -by_t by(F f, P p) -{ - return {std::move(f), std::move(p)}; -} - -template -struct compose_t -{ - F f; - G g; - template - auto operator()(Ts&&...
xs) const MIOPEN_RETURNS(f(g(std::forward(xs)...))) -}; - -template -compose_t compose(F f, G g) -{ - return {std::move(f), std::move(g)}; -} - -template -struct flip_t -{ - F f; - template - auto operator()(T&& x, U&& y) const MIOPEN_RETURNS(f(std::forward(y), std::forward(x))) -}; - -template -flip_t flip(F f) -{ - return {std::move(f)}; -} - -template -struct sequence_t -{ - F f; - template - auto operator()(IntegralConstant) const - MIOPEN_RETURNS(detail::each_i_impl(f, std::make_index_sequence())); -}; - -template -sequence_t sequence(F f) -{ - return {std::move(f)}; -} - -template -void repeat_n(F f, std::integral_constant) -{ - auto fs = [&f](auto... is) { return each_args(f, is...); }; - sequence(fs)(std::integral_constant{}); -} - -template -struct cast_to -{ - template - T operator()(X&& x) const - { - return static_cast(std::forward(x)); - } -}; - -template -auto unpacker(F f) -{ - return [=](auto xs) { return miopen::unpack(f, xs); }; -}; - -template -auto prepender(F f, Xs... xs) -{ - return [=](auto... ys) { return f(xs..., ys...); }; -} - -} // namespace miopen - +#include #endif diff --git a/projects/miopen/src/include/miopen/par_for.hpp b/projects/miopen/src/include/miopen/par_for.hpp index 1272dcf6ac9b..71a1125de408 100644 --- a/projects/miopen/src/include/miopen/par_for.hpp +++ b/projects/miopen/src/include/miopen/par_for.hpp @@ -23,127 +23,8 @@ * SOFTWARE. * *******************************************************************************/ - +// Forwarding header -- implementation moved to common_utils. #ifndef MIOPEN_GUARD_MLOPEN_PAR_FOR_HPP #define MIOPEN_GUARD_MLOPEN_PAR_FOR_HPP - -#include -#include -#include -#include -#include -#include - -#include - -namespace miopen { - -struct joinable_thread : std::thread -{ - template - joinable_thread(Xs&&... xs) : std::thread(std::forward(xs)...) 
// NOLINT - { - } - - joinable_thread& operator=(joinable_thread&& other) = default; - joinable_thread(joinable_thread&& other) = default; - - ~joinable_thread() - { - if(this->joinable()) - this->join(); - } -}; - -struct thread_factory -{ - template - joinable_thread operator()(std::size_t& work, std::size_t n, std::size_t grainsize, F f) const - { - auto result = joinable_thread([=] { - std::size_t start = work; - std::size_t last = std::min(n, work + grainsize); - for(std::size_t i = start; i < last; i++) - { - f(i); - } - }); - work += grainsize; - return result; - } -}; - -template -void par_for_impl(std::size_t n, std::size_t threadsize, F f) -{ - if(threadsize <= 1) - { - for(std::size_t i = 0; i < n; i++) - f(i); - } - else - { - std::vector threads(threadsize); - const std::size_t grainsize = std::ceil(static_cast(n) / threads.size()); - - std::size_t work = 0; - std::generate(threads.begin(), - threads.end(), - std::bind(thread_factory{}, std::ref(work), n, grainsize, f)); - assert(work >= n); - } -} - -template -void par_for(std::size_t n, std::size_t min_grain, F f) -{ - const auto threadsize = - std::min(std::thread::hardware_concurrency(), n / min_grain); - par_for_impl(n, threadsize, f); -} - -struct min_grain -{ - std::size_t n = 0; -}; - -template -void par_for(std::size_t n, min_grain mg, F f) -{ - const auto threadsize = std::min(std::thread::hardware_concurrency(), n / mg.n); - par_for_impl(n, threadsize, f); -} - -template -void par_for(std::size_t n, F f) -{ - par_for(n, min_grain{8}, f); -} - -struct max_threads -{ - std::size_t n = 0; -}; - -template -void par_for(std::size_t n, max_threads mt, F f) -{ - const auto threadsize = std::min(std::thread::hardware_concurrency(), mt.n); - par_for_impl(n, std::min(threadsize, n), f); -} - -template -void par_for_strided(std::size_t n, max_threads mt, F f) -{ - auto threadsize = std::min(std::thread::hardware_concurrency(), mt.n); - par_for_impl(threadsize, threadsize, [&](auto start) { - 
for(std::size_t i = start; i < n; i += threadsize) - { - f(i); - } - }); -} - -} // namespace miopen - +#include #endif diff --git a/projects/miopen/src/include/miopen/rank.hpp b/projects/miopen/src/include/miopen/rank.hpp index 013ec6e7f7f4..1756782673ad 100644 --- a/projects/miopen/src/include/miopen/rank.hpp +++ b/projects/miopen/src/include/miopen/rank.hpp @@ -23,20 +23,8 @@ * SOFTWARE. * *******************************************************************************/ +// Forwarding header -- implementation moved to common_utils. #ifndef GUARD_MIOPEN_RANK_HPP #define GUARD_MIOPEN_RANK_HPP - -namespace miopen { - -template -struct rank : rank -{ -}; - -template <> -struct rank<0> -{ -}; -} // namespace miopen - +#include #endif diff --git a/projects/miopen/src/include/miopen/returns.hpp b/projects/miopen/src/include/miopen/returns.hpp index 4fdb1db18b87..dd0873cfb2b3 100644 --- a/projects/miopen/src/include/miopen/returns.hpp +++ b/projects/miopen/src/include/miopen/returns.hpp @@ -23,16 +23,8 @@ * SOFTWARE. * *******************************************************************************/ - +// Forwarding header -- implementation moved to common_utils. #ifndef GUARD_MIOPEN_RETURNS_HPP #define GUARD_MIOPEN_RETURNS_HPP - -#define MIOPEN_RETURNS(...) \ - ->decltype(__VA_ARGS__) { return __VA_ARGS__; } - -#define MIOPEN_BODY_RETURNS(...) \ - { \ - return __VA_ARGS__; \ - } - +#include #endif diff --git a/projects/miopen/src/include/miopen/type_name.hpp b/projects/miopen/src/include/miopen/type_name.hpp index ac7fd2ff6017..d2cce63d3d32 100644 --- a/projects/miopen/src/include/miopen/type_name.hpp +++ b/projects/miopen/src/include/miopen/type_name.hpp @@ -23,117 +23,8 @@ * SOFTWARE. * *******************************************************************************/ - +// Forwarding header -- implementation moved to common_utils. 
#ifndef GUARD_TYPE_NAME_HPP #define GUARD_TYPE_NAME_HPP - -#include -#include -#if defined(_MSC_VER) && !defined(__clang__) && !defined(__GNUC__) -#include -#endif - -namespace miopen { - -template -constexpr std::string_view type_name() -{ -#if defined(__clang__) || defined(__GNUC__) - // clang or gcc - constexpr auto full_name = std::string_view{__PRETTY_FUNCTION__}; -#elif defined(_MSC_VER) - // msvc - constexpr auto full_name = std::string_view{__FUNCSIG__}; -#endif - - // The substring with the data type name is located within the original string, between the - // prefix and the suffix, with the prefix always not at the beginning of the string and the - // suffix always at the end of the string. -#if defined(__clang__) - // clang - constexpr auto prefix = std::string_view{"[T = "}; - constexpr auto suffix = std::string_view{"]"}; -#elif defined(__GNUC__) - // gcc - constexpr auto prefix = std::string_view{"[with T = "}; - constexpr auto suffix = std::string_view{"; std::string_view = std::basic_string_view]"}; -#elif defined(_MSC_VER) - // msvc - constexpr auto prefix = std::string_view{"type_name<"}; - constexpr auto suffix = std::string_view{">(void)"}; -#endif - - constexpr auto prefix_pos = full_name.find(prefix); - static_assert(prefix_pos != std::string_view::npos); - - constexpr auto suffix_pos = full_name.rfind(suffix); - static_assert(suffix_pos != std::string_view::npos); - static_assert(suffix_pos == full_name.size() - suffix.size()); - - constexpr auto pos = prefix_pos + prefix.size(); - static_assert(pos < suffix_pos); - constexpr auto count = suffix_pos - pos; - - constexpr auto name = full_name.substr(pos, count); - -#if defined(__clang__) || defined(__GNUC__) - // clang or gcc - return name; -#elif defined(_MSC_VER) - // msvc - if constexpr(std::is_compound_v) - { - // For compound data types, the string contains the keyword 'class/struct/union/enum' before - // the data type name, separated by a space. 
- constexpr auto sep = std::string_view{" "}; - constexpr auto sep_pos = name.find(sep); - static_assert(sep_pos != std::string_view::npos); - static_assert(sep_pos != 0); // must not be at the 0 position - - constexpr auto name_pos = sep_pos + sep.size(); - constexpr auto tname = name.substr(name_pos); - static_assert(tname.size() > 0); - - return tname; - } - else - { - return name; - } -#endif -} - -template -constexpr std::string_view type_name_bare() -{ - constexpr auto name = type_name(); - constexpr auto pos = name.rfind(':'); - if constexpr(pos == std::string_view::npos) - { - constexpr auto result = name; - return result; - } - else - { - constexpr auto bare_name = name.substr(pos + 1); - static_assert(bare_name.size() > 0); - return bare_name; - } -} - -template -const std::string& get_type_name() -{ - static const auto ret = std::string(type_name()); - return ret; -} - -template -const std::string& get_type_name(const T&) -{ - return miopen::get_type_name(); -} - -} // namespace miopen - +#include #endif diff --git a/projects/miopen/test/CMakeLists.txt b/projects/miopen/test/CMakeLists.txt index 57601d45ceaf..bef91d0ea871 100755 --- a/projects/miopen/test/CMakeLists.txt +++ b/projects/miopen/test/CMakeLists.txt @@ -414,9 +414,9 @@ function(add_test_executable TEST_NAME) endif() # MIOpen_with_plugins ensures CK plugin .so's are built alongside the test if(NOT MIOPEN_EMBED_DB STREQUAL "") - target_link_libraries(${TEST_NAME} MIOpen_with_plugins miopen_data) + target_link_libraries(${TEST_NAME} MIOpen_with_plugins miopen_data miopen_common_utils) else() - target_link_libraries(${TEST_NAME} MIOpen_with_plugins) + target_link_libraries(${TEST_NAME} MIOpen_with_plugins miopen_common_utils) endif() target_include_directories(${TEST_NAME} PRIVATE ../src/kernels) if(WIN32) From 69c619ea64d55fc0660260b8755ae97b7d305f81 Mon Sep 17 00:00:00 2001 From: Brad Pepers Date: Sat, 9 May 2026 07:14:21 -0600 Subject: [PATCH 02/11] Move bfloat16, stringutils, reduce_common, 
random to common_utils Continue populating the common_utils shared utility library: - bfloat16.hpp: Removed miopen/config.h dependency, MIOPEN_USE_RNE_BFLOAT16 now provided via CMake compile definition on the INTERFACE target - stringutils.hpp: Replaced miopen/errors.hpp dependency with std::runtime_error, updated algorithm include to common_utils path - reduce_common.hpp: Updated bfloat16 include to common_utils path - random.hpp: Moved from driver/ to common_utils/ to break the circular dependency between driver/ and test/. Note: still depends on miopen/env.hpp (to be cleaned up in Phase 2) Forwarding headers left at all original locations. Co-Authored-By: Claude Sonnet 4 --- projects/miopen/common_utils/CMakeLists.txt | 9 + .../include/common_utils/bfloat16.hpp | 179 ++++++++++++++++++ .../include/common_utils/random.hpp | 159 ++++++++++++++++ .../include/common_utils/reduce_common.hpp | 66 +++++++ .../include/common_utils/stringutils.hpp | 165 ++++++++++++++++ projects/miopen/driver/random.hpp | 160 +--------------- .../miopen/src/include/miopen/bfloat16.hpp | 178 +---------------- .../src/include/miopen/reduce_common.hpp | 65 +------ .../miopen/src/include/miopen/stringutils.hpp | 166 +--------------- 9 files changed, 588 insertions(+), 559 deletions(-) create mode 100644 projects/miopen/common_utils/include/common_utils/bfloat16.hpp create mode 100644 projects/miopen/common_utils/include/common_utils/random.hpp create mode 100644 projects/miopen/common_utils/include/common_utils/reduce_common.hpp create mode 100644 projects/miopen/common_utils/include/common_utils/stringutils.hpp diff --git a/projects/miopen/common_utils/CMakeLists.txt b/projects/miopen/common_utils/CMakeLists.txt index c0f4620a3439..1afb185255c9 100644 --- a/projects/miopen/common_utils/CMakeLists.txt +++ b/projects/miopen/common_utils/CMakeLists.txt @@ -33,3 +33,12 @@ add_library(MIOpen::common_utils ALIAS miopen_common_utils) target_include_directories(miopen_common_utils INTERFACE $ ) + 
+ +# bfloat16.hpp needs to know the rounding mode. +# This option is also defined in src/CMakeLists.txt for backward compatibility. +option(MIOPEN_USE_RNE_BFLOAT16 "Sets rounding scheme for bfloat16 type" ON) +if(MIOPEN_USE_RNE_BFLOAT16) + target_compile_definitions(miopen_common_utils INTERFACE MIOPEN_USE_RNE_BFLOAT16=1) +else() + target_compile_definitions(miopen_common_utils INTERFACE MIOPEN_USE_RNE_BFLOAT16=0) +endif() diff --git a/projects/miopen/common_utils/include/common_utils/bfloat16.hpp b/projects/miopen/common_utils/include/common_utils/bfloat16.hpp new file mode 100644 index 000000000000..71fe70bbd3c7 --- /dev/null +++ b/projects/miopen/common_utils/include/common_utils/bfloat16.hpp @@ -0,0 +1,179 @@ +// Copyright (c) Advanced Micro Devices, Inc., or its affiliates. +// SPDX-License-Identifier: MIT + +#ifndef BFLOAT16_H_ +#define BFLOAT16_H_ + +#include +// MIOPEN_USE_RNE_BFLOAT16 is provided via CMake compile definitions. + +class bfloat16 +{ +public: + bfloat16() : data_{0} {} + explicit bfloat16(float rhs) + { + union + { + float float_st; + std::uint32_t bf16_st; + } bits_st = {rhs}; + + // BF16 round and NaN preservation code matches + // https://github.com/ROCm/rocBLAS/blob/develop/library/include/rocblas_bfloat16.h + if((~bits_st.bf16_st & 0x7f800000) == 0) // Inf or NaN + { + // When all of the exponent bits are 1, the value is Inf or NaN. + // Inf is indicated by a zero mantissa. NaN is indicated by any nonzero + // mantissa bit. Quiet NaN is indicated by the most significant mantissa + // bit being 1. Signaling NaN is indicated by the most significant + // mantissa bit being 0 but some other bit(s) being 1. If any of the + // lower 16 bits of the mantissa are 1, we set the least significant bit + // of the bfloat16 mantissa, in order to preserve signaling NaN in case + // the bfloat16's mantissa bits are all 0.
+ if((bits_st.bf16_st & 0xffff) != 0) + { + bits_st.bf16_st |= 0x10000; // Preserve signaling NaN + } + } + else + { +#if MIOPEN_USE_RNE_BFLOAT16 == 1 + // When the exponent bits are not all 1s, then the value is zero, normal, + // or subnormal. We round the bfloat16 mantissa up by adding 0x7FFF, plus + // 1 if the least significant bit of the bfloat16 mantissa is 1 (odd). + // This causes the bfloat16's mantissa to be incremented by 1 if the 16 + // least significant bits of the float mantissa are greater than 0x8000, + // or if they are equal to 0x8000 and the least significant bit of the + // bfloat16 mantissa is 1 (odd). This causes it to be rounded to even when + // the lower 16 bits are exactly 0x8000. If the bfloat16 mantissa already + // has the value 0x7f, then incrementing it causes it to become 0x00 and + // the exponent is incremented by one, which is the next higher FP value + // to the unrounded bfloat16 value. When the bfloat16 value is subnormal + // with an exponent of 0x00 and a mantissa of 0x7F, it may be rounded up + // to a normal value with an exponent of 0x01 and a mantissa of 0x00. + // When the bfloat16 value has an exponent of 0xFE and a mantissa of 0x7F, + // incrementing it causes it to become an exponent of 0xFF and a mantissa + // of 0x00, which is Inf, the next higher value to the unrounded value. 
+ bits_st.bf16_st += + (0x7fff + ((bits_st.bf16_st >> 16) & 1)); // Round to nearest, round to even +#else // truncation +// do nothing +#endif + } + data_ = bits_st.bf16_st >> 16; + } + operator float() const + { + union + { + std::uint32_t bf16_st; + float float_st; + } bits_st = {data_}; + + bits_st.bf16_st = bits_st.bf16_st << 16; + return bits_st.float_st; + } + + bfloat16 operator-() const { return bfloat16(-static_cast(*this)); } + bfloat16 operator+() const { return *this; } + + bfloat16& operator=(const float rhs) + { + *this = bfloat16(rhs); + return *this; + } + bfloat16& operator+=(bfloat16 rhs) + { + *this = bfloat16(static_cast(*this) + static_cast(rhs)); + return *this; + } + + bfloat16& operator+=(float rhs) + { + *this = bfloat16(static_cast(*this) + rhs); + return *this; + } + + bfloat16& operator-=(bfloat16 rhs) + { + *this += -rhs; + return *this; + } + bfloat16& operator*=(bfloat16 rhs) + { + *this = bfloat16(static_cast(*this) * static_cast(rhs)); + return *this; + } + bfloat16& operator*=(float rhs) + { + *this = bfloat16(static_cast(*this) * rhs); + return *this; + } + + bfloat16& operator/=(bfloat16 rhs) + { + *this = bfloat16(static_cast(*this) / static_cast(rhs)); + return *this; + } + bool operator<(bfloat16 rhs) const + { + return static_cast(*this) < static_cast(rhs); + } + bool operator==(bfloat16 rhs) const { return std::equal_to()(*this, rhs); } + + static constexpr bfloat16 generate(uint16_t val) { return bfloat16{val, true}; } + +private: + constexpr bfloat16(std::uint16_t val, bool) : data_{val} {} + + std::uint16_t data_; +}; + +inline bfloat16 operator+(bfloat16 a, const bfloat16& b) +{ + a += b; + return a; +} + +inline bfloat16 operator-(bfloat16 a, const bfloat16& b) +{ + a -= b; + return a; +} + +inline bfloat16 operator*(bfloat16 a, const bfloat16& b) +{ + a *= b; + return a; +} + +inline bfloat16 operator/(bfloat16 a, const bfloat16& b) +{ + a /= b; + return a; +} + +namespace std { +template <> +class numeric_limits +{ 
+public: + static constexpr bool is_specialized = true; + static constexpr bfloat16 min() noexcept { return bfloat16::generate(0x0080); } // 0x1.00p-126 + static constexpr bfloat16 max() noexcept { return bfloat16::generate(0x7F7F); } + static constexpr bfloat16 lowest() noexcept { return bfloat16::generate(0xFF7F); } + static constexpr bfloat16 epsilon() noexcept { return bfloat16::generate(0x3C00); } + static constexpr bfloat16 infinity() noexcept { return bfloat16::generate(0x7F80); } + static constexpr bfloat16 quiet_NaN() noexcept { return bfloat16::generate(0x7FC0); } // qnan(0) + static constexpr bfloat16 signaling_NaN() noexcept + { + return bfloat16::generate(0x7F81); // snan(1) + } + static constexpr bfloat16 denorm_min() noexcept + { + return bfloat16::generate(0x0001); // 0x0.02p-126 + } +}; +} // namespace std +#endif diff --git a/projects/miopen/common_utils/include/common_utils/random.hpp b/projects/miopen/common_utils/include/common_utils/random.hpp new file mode 100644 index 000000000000..f6f8d85c4ce4 --- /dev/null +++ b/projects/miopen/common_utils/include/common_utils/random.hpp @@ -0,0 +1,159 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2025 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + *******************************************************************************/ +#ifndef GUARD_RANDOM_GEN_ +#define GUARD_RANDOM_GEN_ + +#include + +#include +#include +#include + +MIOPEN_DECLARE_ENV_VAR_UINT64(MIOPEN_DEBUG_DRIVER_PRNG_SEED, 12345678) + +namespace env = miopen::env; + +namespace prng { +namespace details { +using glibc_gen = std::linear_congruential_engine; + +inline std::random_device::result_type get_default_seed() +{ + static std::random_device::result_type seed{[] { + auto external_seed = env::value(MIOPEN_DEBUG_DRIVER_PRNG_SEED); + + auto seed_ = external_seed == 0 + ? 
std::random_device{}() + : static_cast(external_seed); + std::cout << "PRNG seed: " << seed_ << "\n"; + return seed_; + }()}; + return seed; +} + +inline glibc_gen& get_prng() +{ + static thread_local glibc_gen gen{get_default_seed()}; + return gen; +} + +template +struct has_digits : std::false_type +{ +}; + +template +struct has_digits::digits)>> : std::true_type +{ +}; + +} // namespace details + +inline void reset_seed(std::random_device::result_type seed = 0) +{ + details::get_prng().seed(seed + details::get_default_seed()); +} + +// similar to std::generate_canonical, but simpler and faster +template +inline T gen_canonical() +{ + if constexpr(std::is_floating_point_v) // native fp + { + static constexpr T range = + static_cast(1) / + static_cast(details::glibc_gen::max() - details::glibc_gen::min() + 1); + return range * static_cast(details::get_prng()() - details::glibc_gen::min()); + } + else if constexpr(std::is_integral_v) + { + auto val = details::get_prng()(); + return static_cast(((val >> 4) + (val >> 16)) & 0x1); + } + else + { + return static_cast(gen_canonical()); + } +} + +template +inline T gen_0_to_B(T B) +{ + if constexpr(std::is_floating_point_v) // native fp + { + return gen_canonical() * B; + } + else if constexpr(std::is_integral_v) + { + // can only generate 27bit range, so it may not be suitable + // for huge 64 bit ranges, but we do not expect such ranges + return static_cast((details::get_prng()() >> 4) % B); + } + else // half/bfloat/etc + { + return static_cast(gen_0_to_B(static_cast(B))); + } +} + +template +inline T gen_A_to_B(T A, T B) +{ + assert(B > A); + return gen_0_to_B(B - A) + A; +} + +template +inline T gen_off_range(T offset, T range) +{ + static_assert(std::is_integral_v); + return prng::gen_0_to_B(range) + offset; +} + +template +inline T gen_subnorm() +{ + T denorm_val = static_cast(0); + if constexpr(!std::is_integral_v && !std::is_same_v && + std::is_trivially_copyable::value && details::has_digits::value) + { + using 
BitType = std::conditional_t>; + static_assert(sizeof(T) == sizeof(BitType)); + + // -1 because ::digits counts the first implicit digit + static constexpr auto mantissa_bits = std::numeric_limits::digits - 1; + + BitType denorm_bits = static_cast(gen_0_to_B(1 << mantissa_bits)); + denorm_bits |= Signed ? (gen_canonical() << (sizeof(T) * 8 - 1)) : 0; + + // the proper way to do a type punning + std::memcpy(&denorm_val, &denorm_bits, sizeof(T)); + } + return denorm_val; +} +} // namespace prng +#endif // GUARD_RANDOM_GEN_ diff --git a/projects/miopen/common_utils/include/common_utils/reduce_common.hpp b/projects/miopen/common_utils/include/common_utils/reduce_common.hpp new file mode 100644 index 000000000000..74ce541f694b --- /dev/null +++ b/projects/miopen/common_utils/include/common_utils/reduce_common.hpp @@ -0,0 +1,66 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2020 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + *******************************************************************************/ +#ifndef GUARD_MIOPEN_REDUCE_COMMON_HPP +#define GUARD_MIOPEN_REDUCE_COMMON_HPP + +#include +#include + +namespace reduce { + +template +static inline Tdst convert_type(Tsrc x) +{ + return static_cast(x); +} + +template <> +inline float convert_type(half_float::half x) +{ + return half_float::half_cast(x); +}; + +template <> +inline half_float::half convert_type(float x) +{ + return half_float::half_cast(x); +}; + +template <> +inline float convert_type(bfloat16 x) +{ + return float(x); +}; + +template <> +inline bfloat16 convert_type(float x) +{ + return bfloat16(x); +}; + +}; // end of namespace reduce + +#endif diff --git a/projects/miopen/common_utils/include/common_utils/stringutils.hpp b/projects/miopen/common_utils/include/common_utils/stringutils.hpp new file mode 100644 index 000000000000..19d579014c73 --- /dev/null +++ b/projects/miopen/common_utils/include/common_utils/stringutils.hpp @@ -0,0 +1,165 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2017 Advanced Micro Devices, Inc. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + *******************************************************************************/ +#ifndef GUARD_MIOPEN_STRINGUTILS_HPP +#define GUARD_MIOPEN_STRINGUTILS_HPP + +#include +#include +#include +#include +#include +#include +#include +#include + +#define MIOPEN_STRINGIZE_1(...) #__VA_ARGS__ +#define MIOPEN_STRINGIZE(...) 
MIOPEN_STRINGIZE_1(__VA_ARGS__) + +namespace miopen { + +inline std::string +ReplaceString(const std::string& in, const std::string& search, const std::string& replace) +{ + size_t pos = 0; + std::string subject(in); + while((pos = subject.find(search, pos)) != std::string::npos) + { + subject.replace(pos, search.length(), replace); + pos += replace.length(); + } + return subject; +} + +inline bool EndsWith(const std::string& value, const std::string& suffix) +{ + if(suffix.size() > value.size()) + return false; + else + return std::equal(suffix.rbegin(), suffix.rend(), value.rbegin()); +} + +template +inline std::string JoinStrings(Strings strings, std::string delim) +{ + auto it = strings.begin(); + if(it == strings.end()) + return ""; + + auto nit = std::next(it); + return std::accumulate( + nit, strings.end(), *it, [&](std::string x, std::string y) { return x + delim + y; }); +} + +template +static inline std::string TransformString(std::string s, F f) +{ + std::transform(s.begin(), s.end(), s.begin(), f); + return s; +} + +inline std::string ToUpper(std::string s) { return TransformString(std::move(s), ::toupper); } + +inline bool StartsWith(const std::string& value, const std::string& prefix) +{ + if(prefix.size() > value.size()) + return false; + else + return std::equal(prefix.begin(), prefix.end(), value.begin()); +} + +inline std::string RemovePrefix(std::string s, std::string prefix) +{ + if(StartsWith(s, prefix)) + return s.substr(prefix.length()); + else + return s; +} + +inline std::vector SplitSpaceSeparated(const std::string& in) +{ + std::istringstream ss(in); + const std::istream_iterator begin(ss), end; + return {begin, end}; +} + +inline std::vector SplitSpaceSeparated(const std::vector& in) +{ + std::vector rv; + for(const auto& item : in) + { + if(item.find(' ') != std::string::npos) + { + const auto splitted = SplitSpaceSeparated(item); + std::copy(splitted.begin(), splitted.end(), std::back_inserter(rv)); + } + else + { + 
rv.emplace_back(item); + } + } + return rv; +} + +inline std::vector SplitSpaceSeparated(const std::string& in, + const std::vector& dontSplitAfter) +{ + std::vector rv; + std::istringstream ss(in); + std::string s; + while(ss >> s) + { + if(any_of(dontSplitAfter, [&](const auto& dont) { return dont == s; })) + { + std::string s2; + if(ss >> s2) + { + s += std::string(" ").append(s2); // Exactly one space is important. + rv.push_back(s); + continue; + } + throw std::runtime_error("Error parsing string: '" + in + '\''); + } + rv.push_back(s); + } + return rv; +} + +inline std::vector SplitDelim(const std::string& in, const char delim) +{ + std::vector rv; + std::string token; + std::istringstream ss(in); + + while(std::getline(ss, token, delim)) + { + rv.push_back(token); + } + return rv; +} + +} // namespace miopen + +#endif // GUARD_MIOPEN_STRINGUTILS_HPP diff --git a/projects/miopen/driver/random.hpp b/projects/miopen/driver/random.hpp index f6f8d85c4ce4..81e630411c67 100644 --- a/projects/miopen/driver/random.hpp +++ b/projects/miopen/driver/random.hpp @@ -1,159 +1,5 @@ -/******************************************************************************* - * - * MIT License - * - * Copyright (c) 2025 Advanced Micro Devices, Inc. - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. 
- * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - * - *******************************************************************************/ +// Forwarding header — implementation moved to common_utils. #ifndef GUARD_RANDOM_GEN_ #define GUARD_RANDOM_GEN_ - -#include - -#include -#include -#include - -MIOPEN_DECLARE_ENV_VAR_UINT64(MIOPEN_DEBUG_DRIVER_PRNG_SEED, 12345678) - -namespace env = miopen::env; - -namespace prng { -namespace details { -using glibc_gen = std::linear_congruential_engine; - -inline std::random_device::result_type get_default_seed() -{ - static std::random_device::result_type seed{[] { - auto external_seed = env::value(MIOPEN_DEBUG_DRIVER_PRNG_SEED); - - auto seed_ = external_seed == 0 - ? 
std::random_device{}() - : static_cast(external_seed); - std::cout << "PRNG seed: " << seed_ << "\n"; - return seed_; - }()}; - return seed; -} - -inline glibc_gen& get_prng() -{ - static thread_local glibc_gen gen{get_default_seed()}; - return gen; -} - -template -struct has_digits : std::false_type -{ -}; - -template -struct has_digits::digits)>> : std::true_type -{ -}; - -} // namespace details - -inline void reset_seed(std::random_device::result_type seed = 0) -{ - details::get_prng().seed(seed + details::get_default_seed()); -} - -// similar to std::generate_canonical, but simpler and faster -template -inline T gen_canonical() -{ - if constexpr(std::is_floating_point_v) // native fp - { - static constexpr T range = - static_cast(1) / - static_cast(details::glibc_gen::max() - details::glibc_gen::min() + 1); - return range * static_cast(details::get_prng()() - details::glibc_gen::min()); - } - else if constexpr(std::is_integral_v) - { - auto val = details::get_prng()(); - return static_cast(((val >> 4) + (val >> 16)) & 0x1); - } - else - { - return static_cast(gen_canonical()); - } -} - -template -inline T gen_0_to_B(T B) -{ - if constexpr(std::is_floating_point_v) // native fp - { - return gen_canonical() * B; - } - else if constexpr(std::is_integral_v) - { - // can only generate 27bit range, so it may not be suitable - // for huge 64 bit ranges, but we do not expect such ranges - return static_cast((details::get_prng()() >> 4) % B); - } - else // half/bfloat/etc - { - return static_cast(gen_0_to_B(static_cast(B))); - } -} - -template -inline T gen_A_to_B(T A, T B) -{ - assert(B > A); - return gen_0_to_B(B - A) + A; -} - -template -inline T gen_off_range(T offset, T range) -{ - static_assert(std::is_integral_v); - return prng::gen_0_to_B(range) + offset; -} - -template -inline T gen_subnorm() -{ - T denorm_val = static_cast(0); - if constexpr(!std::is_integral_v && !std::is_same_v && - std::is_trivially_copyable::value && details::has_digits::value) - { - using 
BitType = std::conditional_t>; - static_assert(sizeof(T) == sizeof(BitType)); - - // -1 because ::digits counts the first implicit digit - static constexpr auto mantissa_bits = std::numeric_limits::digits - 1; - - BitType denorm_bits = static_cast(gen_0_to_B(1 << mantissa_bits)); - denorm_bits |= Signed ? (gen_canonical() << (sizeof(T) * 8 - 1)) : 0; - - // the proper way to do a type punning - std::memcpy(&denorm_val, &denorm_bits, sizeof(T)); - } - return denorm_val; -} -} // namespace prng -#endif // GUARD_RANDOM_GEN_ +#include +#endif diff --git a/projects/miopen/src/include/miopen/bfloat16.hpp b/projects/miopen/src/include/miopen/bfloat16.hpp index 3e3a184a72d1..fc3880629c68 100644 --- a/projects/miopen/src/include/miopen/bfloat16.hpp +++ b/projects/miopen/src/include/miopen/bfloat16.hpp @@ -1,179 +1,5 @@ -// Copyright (c) Advanced Micro Devices, Inc., or its affiliates. -// SPDX-License-Identifier: MIT - +// Forwarding header — implementation moved to common_utils. #ifndef BFLOAT16_H_ #define BFLOAT16_H_ - -#include -#include - -class bfloat16 -{ -public: - bfloat16() : data_{0} {} - explicit bfloat16(float rhs) - { - union - { - float float_st; - std::uint32_t bf16_st; - } bits_st = {rhs}; - - // BF16 round and NaN preservation code matches - // https://github.com/ROCm/rocBLAS/blob/develop/library/include/rocblas_bfloat16.h - if((~bits_st.bf16_st & 0x7f800000) == 0) // Inf or NaN - { - // When all of the exponent bits are 1, the value is Inf or NaN. - // Inf is indicated by a zero mantissa. NaN is indicated by any nonzero - // mantissa bit. Quiet NaN is indicated by the most significant mantissa - // bit being 1. Signaling NaN is indicated by the most significant - // mantissa bit being 0 but some other bit(s) being 1. If any of the - // lower 16 bits of the mantissa are 1, we set the least significant bit - // of the bfloat16 mantissa, in order to preserve signaling NaN in case - // the bloat16's mantissa bits are all 0. 
- if((bits_st.bf16_st & 0xffff) != 0) - { - bits_st.bf16_st |= 0x10000; // Preserve signaling NaN - } - } - else - { -#if MIOPEN_USE_RNE_BFLOAT16 == 1 - // When the exponent bits are not all 1s, then the value is zero, normal, - // or subnormal. We round the bfloat16 mantissa up by adding 0x7FFF, plus - // 1 if the least significant bit of the bfloat16 mantissa is 1 (odd). - // This causes the bfloat16's mantissa to be incremented by 1 if the 16 - // least significant bits of the float mantissa are greater than 0x8000, - // or if they are equal to 0x8000 and the least significant bit of the - // bfloat16 mantissa is 1 (odd). This causes it to be rounded to even when - // the lower 16 bits are exactly 0x8000. If the bfloat16 mantissa already - // has the value 0x7f, then incrementing it causes it to become 0x00 and - // the exponent is incremented by one, which is the next higher FP value - // to the unrounded bfloat16 value. When the bfloat16 value is subnormal - // with an exponent of 0x00 and a mantissa of 0x7F, it may be rounded up - // to a normal value with an exponent of 0x01 and a mantissa of 0x00. - // When the bfloat16 value has an exponent of 0xFE and a mantissa of 0x7F, - // incrementing it causes it to become an exponent of 0xFF and a mantissa - // of 0x00, which is Inf, the next higher value to the unrounded value. 
- bits_st.bf16_st += - (0x7fff + ((bits_st.bf16_st >> 16) & 1)); // Round to nearest, round to even -#else // truncation -// do nothing -#endif - } - data_ = bits_st.bf16_st >> 16; - } - operator float() const - { - union - { - std::uint32_t bf16_st; - float float_st; - } bits_st = {data_}; - - bits_st.bf16_st = bits_st.bf16_st << 16; - return bits_st.float_st; - } - - bfloat16 operator-() const { return bfloat16(-static_cast(*this)); } - bfloat16 operator+() const { return *this; } - - bfloat16& operator=(const float rhs) - { - *this = bfloat16(rhs); - return *this; - } - bfloat16& operator+=(bfloat16 rhs) - { - *this = bfloat16(static_cast(*this) + static_cast(rhs)); - return *this; - } - - bfloat16& operator+=(float rhs) - { - *this = bfloat16(static_cast(*this) + rhs); - return *this; - } - - bfloat16& operator-=(bfloat16 rhs) - { - *this += -rhs; - return *this; - } - bfloat16& operator*=(bfloat16 rhs) - { - *this = bfloat16(static_cast(*this) * static_cast(rhs)); - return *this; - } - bfloat16& operator*=(float rhs) - { - *this = bfloat16(static_cast(*this) * rhs); - return *this; - } - - bfloat16& operator/=(bfloat16 rhs) - { - *this = bfloat16(static_cast(*this) / static_cast(rhs)); - return *this; - } - bool operator<(bfloat16 rhs) const - { - return static_cast(*this) < static_cast(rhs); - } - bool operator==(bfloat16 rhs) const { return std::equal_to()(*this, rhs); } - - static constexpr bfloat16 generate(uint16_t val) { return bfloat16{val, true}; } - -private: - constexpr bfloat16(std::uint16_t val, bool) : data_{val} {} - - std::uint16_t data_; -}; - -inline bfloat16 operator+(bfloat16 a, const bfloat16& b) -{ - a += b; - return a; -} - -inline bfloat16 operator-(bfloat16 a, const bfloat16& b) -{ - a -= b; - return a; -} - -inline bfloat16 operator*(bfloat16 a, const bfloat16& b) -{ - a *= b; - return a; -} - -inline bfloat16 operator/(bfloat16 a, const bfloat16& b) -{ - a /= b; - return a; -} - -namespace std { -template <> -class numeric_limits -{ 
-public: - static constexpr bool is_specialized = true; - static constexpr bfloat16 min() noexcept { return bfloat16::generate(0x0080); } // 0x1.00p-126 - static constexpr bfloat16 max() noexcept { return bfloat16::generate(0x7F7F); } - static constexpr bfloat16 lowest() noexcept { return bfloat16::generate(0xFF7F); } - static constexpr bfloat16 epsilon() noexcept { return bfloat16::generate(0x3C00); } - static constexpr bfloat16 infinity() noexcept { return bfloat16::generate(0x7F80); } - static constexpr bfloat16 quiet_NaN() noexcept { return bfloat16::generate(0x7FC0); } // qnan(0) - static constexpr bfloat16 signaling_NaN() noexcept - { - return bfloat16::generate(0x7F81); // snan(1) - } - static constexpr bfloat16 denorm_min() noexcept - { - return bfloat16::generate(0x0001); // 0x0.02p-126 - } -}; -} // namespace std +#include #endif diff --git a/projects/miopen/src/include/miopen/reduce_common.hpp b/projects/miopen/src/include/miopen/reduce_common.hpp index 37b92e727d92..f1bd0b38e320 100644 --- a/projects/miopen/src/include/miopen/reduce_common.hpp +++ b/projects/miopen/src/include/miopen/reduce_common.hpp @@ -1,66 +1,5 @@ -/******************************************************************************* - * - * MIT License - * - * Copyright (c) 2020 Advanced Micro Devices, Inc. - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. 
- * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - * - *******************************************************************************/ +// Forwarding header — implementation moved to common_utils. #ifndef GUARD_MIOPEN_REDUCE_COMMON_HPP #define GUARD_MIOPEN_REDUCE_COMMON_HPP - -#include -#include - -namespace reduce { - -template -static inline Tdst convert_type(Tsrc x) -{ - return static_cast(x); -} - -template <> -inline float convert_type(half_float::half x) -{ - return half_float::half_cast(x); -}; - -template <> -inline half_float::half convert_type(float x) -{ - return half_float::half_cast(x); -}; - -template <> -inline float convert_type(bfloat16 x) -{ - return float(x); -}; - -template <> -inline bfloat16 convert_type(float x) -{ - return bfloat16(x); -}; - -}; // end of namespace reduce - +#include #endif diff --git a/projects/miopen/src/include/miopen/stringutils.hpp b/projects/miopen/src/include/miopen/stringutils.hpp index 5a412416d666..38f52efd1cf6 100644 --- a/projects/miopen/src/include/miopen/stringutils.hpp +++ b/projects/miopen/src/include/miopen/stringutils.hpp @@ -1,165 +1,5 @@ -/******************************************************************************* - * - * MIT License - * - * Copyright (c) 2017 Advanced Micro Devices, Inc. 
- * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - * - *******************************************************************************/ +// Forwarding header — implementation moved to common_utils. #ifndef GUARD_MIOPEN_STRINGUTILS_HPP #define GUARD_MIOPEN_STRINGUTILS_HPP - -#include -#include -#include -#include -#include -#include -#include -#include - -#define MIOPEN_STRINGIZE_1(...) #__VA_ARGS__ -#define MIOPEN_STRINGIZE(...) 
MIOPEN_STRINGIZE_1(__VA_ARGS__) - -namespace miopen { - -inline std::string -ReplaceString(const std::string& in, const std::string& search, const std::string& replace) -{ - size_t pos = 0; - std::string subject(in); - while((pos = subject.find(search, pos)) != std::string::npos) - { - subject.replace(pos, search.length(), replace); - pos += replace.length(); - } - return subject; -} - -inline bool EndsWith(const std::string& value, const std::string& suffix) -{ - if(suffix.size() > value.size()) - return false; - else - return std::equal(suffix.rbegin(), suffix.rend(), value.rbegin()); -} - -template -inline std::string JoinStrings(Strings strings, std::string delim) -{ - auto it = strings.begin(); - if(it == strings.end()) - return ""; - - auto nit = std::next(it); - return std::accumulate( - nit, strings.end(), *it, [&](std::string x, std::string y) { return x + delim + y; }); -} - -template -static inline std::string TransformString(std::string s, F f) -{ - std::transform(s.begin(), s.end(), s.begin(), f); - return s; -} - -inline std::string ToUpper(std::string s) { return TransformString(std::move(s), ::toupper); } - -inline bool StartsWith(const std::string& value, const std::string& prefix) -{ - if(prefix.size() > value.size()) - return false; - else - return std::equal(prefix.begin(), prefix.end(), value.begin()); -} - -inline std::string RemovePrefix(std::string s, std::string prefix) -{ - if(StartsWith(s, prefix)) - return s.substr(prefix.length()); - else - return s; -} - -inline std::vector SplitSpaceSeparated(const std::string& in) -{ - std::istringstream ss(in); - const std::istream_iterator begin(ss), end; - return {begin, end}; -} - -inline std::vector SplitSpaceSeparated(const std::vector& in) -{ - std::vector rv; - for(const auto& item : in) - { - if(item.find(' ') != std::string::npos) - { - const auto splitted = SplitSpaceSeparated(item); - std::copy(splitted.begin(), splitted.end(), std::back_inserter(rv)); - } - else - { - 
rv.emplace_back(item); - } - } - return rv; -} - -inline std::vector SplitSpaceSeparated(const std::string& in, - const std::vector& dontSplitAfter) -{ - std::vector rv; - std::istringstream ss(in); - std::string s; - while(ss >> s) - { - if(any_of(dontSplitAfter, [&](const auto& dont) { return dont == s; })) - { - std::string s2; - if(ss >> s2) - { - s += std::string(" ").append(s2); // Exactly one space is important. - rv.push_back(s); - continue; - } - MIOPEN_THROW("Error parsing string: '" + in + '\''); - } - rv.push_back(s); - } - return rv; -} - -inline std::vector SplitDelim(const std::string& in, const char delim) -{ - std::vector rv; - std::string token; - std::istringstream ss(in); - - while(std::getline(ss, token, delim)) - { - rv.push_back(token); - } - return rv; -} - -} // namespace miopen - -#endif // GUARD_MIOPEN_STRINGUTILS_HPP +#include +#endif From 295374db784d40c334be5f4e544e7d803d28666a Mon Sep 17 00:00:00 2001 From: Brad Pepers Date: Sat, 9 May 2026 07:32:18 -0600 Subject: [PATCH 03/11] Create miopen_utils library and move shared test/verification code Move 12 headers from test/ to miopen_utils/include/miopen_utils/: - tensor_holder.hpp, verify.hpp (used by 30+ driver files) - cpu_conv.hpp, cpu_bias.hpp, cpu_layernorm.hpp (CPU reference) - fusionHost.hpp, gemm.hpp, cpu_reduce_util.hpp, rnn_util.hpp - random.hpp (test initializers) - serialize.hpp, network_data.hpp (tensor_holder dependencies) Include cleanup: - Removed unused #include "test.hpp" from cpu_conv.hpp, cpu_bias.hpp - Removed unused #include "get_handle.hpp" from fusionHost.hpp - Updated all internal cross-references to use and paths Updated 35 driver files to include from instead of <../test/>. Forwarding headers left at original test/ locations for backward compatibility with existing test code. Result: driver/ no longer includes from test/, and miopen_utils/ no longer includes from driver/ or test/. 
Co-Authored-By: Claude Sonnet 4 --- projects/miopen/CMakeLists.txt | 1 + .../miopen/driver/CBAInferFusion_driver.hpp | 6 +- projects/miopen/driver/CMakeLists.txt | 2 +- projects/miopen/driver/adam_driver.hpp | 2 +- .../miopen/driver/addlayernorm_driver.hpp | 4 +- projects/miopen/driver/bn_driver.hpp | 6 +- projects/miopen/driver/cat_driver.hpp | 4 +- projects/miopen/driver/conv_driver.hpp | 8 +- projects/miopen/driver/conv_verify.hpp | 2 +- projects/miopen/driver/ctc_driver.hpp | 2 +- projects/miopen/driver/driver.hpp | 2 +- projects/miopen/driver/dropout_driver.hpp | 2 +- projects/miopen/driver/gemm_driver.hpp | 2 +- projects/miopen/driver/getitem_driver.hpp | 4 +- projects/miopen/driver/glu_driver.hpp | 2 +- projects/miopen/driver/groupnorm_driver.hpp | 4 +- projects/miopen/driver/gru_verify_gemm.hpp | 2 +- projects/miopen/driver/kthvalue_driver.hpp | 4 +- projects/miopen/driver/layernorm_driver.hpp | 6 +- projects/miopen/driver/lrn_driver.hpp | 2 +- projects/miopen/driver/lstm_verify_gemm.hpp | 2 +- projects/miopen/driver/miopen_Reduction.hpp | 2 +- .../miopen/driver/multimarginloss_driver.hpp | 4 +- projects/miopen/driver/prelu_driver.hpp | 2 +- projects/miopen/driver/reduce_driver.hpp | 2 +- .../driver/reducecalculation_driver.hpp | 4 +- .../miopen/driver/reduceextreme_driver.hpp | 4 +- projects/miopen/driver/rnn_driver.hpp | 2 +- projects/miopen/driver/rnn_seq_driver.hpp | 2 +- projects/miopen/driver/rnn_verify_gemm.hpp | 2 +- projects/miopen/driver/rope_driver.hpp | 4 +- .../miopen/driver/softmarginloss_driver.hpp | 4 +- projects/miopen/driver/softmax_driver.hpp | 2 +- projects/miopen/driver/t5layernorm_driver.hpp | 4 +- .../driver/transformers_adam_w_driver.hpp | 2 +- projects/miopen/miopen_utils/CMakeLists.txt | 38 + .../include/miopen_utils/cpu_bias.hpp | 140 +++ .../include/miopen_utils/cpu_conv.hpp | 514 +++++++++ .../include/miopen_utils/cpu_layernorm.hpp | 216 ++++ .../include/miopen_utils/cpu_reduce_util.hpp | 649 ++++++++++++ 
.../include/miopen_utils/fusionHost.hpp | 993 +++++++++++++++++ .../include/miopen_utils/gemm.hpp | 120 +++ .../include/miopen_utils/network_data.hpp | 438 ++++++++ .../include/miopen_utils/random.hpp | 62 ++ .../include/miopen_utils/rnn_util.hpp | 305 ++++++ .../include/miopen_utils/serialize.hpp | 129 +++ .../include/miopen_utils/tensor_holder.hpp | 505 +++++++++ .../include/miopen_utils/verify.hpp | 245 +++++ projects/miopen/test/CMakeLists.txt | 4 +- projects/miopen/test/cpu_bias.hpp | 140 +-- projects/miopen/test/cpu_conv.hpp | 514 +-------- projects/miopen/test/cpu_layernorm.hpp | 215 +--- projects/miopen/test/cpu_reduce_util.hpp | 648 +----------- projects/miopen/test/fusionHost.hpp | 995 +----------------- projects/miopen/test/gemm.hpp | 119 +-- projects/miopen/test/network_data.hpp | 437 +------- projects/miopen/test/random.hpp | 63 +- projects/miopen/test/rnn_util.hpp | 304 +----- projects/miopen/test/serialize.hpp | 128 +-- projects/miopen/test/tensor_holder.hpp | 504 +-------- projects/miopen/test/verify.hpp | 244 +---- 61 files changed, 4436 insertions(+), 4342 deletions(-) create mode 100644 projects/miopen/miopen_utils/CMakeLists.txt create mode 100644 projects/miopen/miopen_utils/include/miopen_utils/cpu_bias.hpp create mode 100644 projects/miopen/miopen_utils/include/miopen_utils/cpu_conv.hpp create mode 100644 projects/miopen/miopen_utils/include/miopen_utils/cpu_layernorm.hpp create mode 100644 projects/miopen/miopen_utils/include/miopen_utils/cpu_reduce_util.hpp create mode 100644 projects/miopen/miopen_utils/include/miopen_utils/fusionHost.hpp create mode 100644 projects/miopen/miopen_utils/include/miopen_utils/gemm.hpp create mode 100644 projects/miopen/miopen_utils/include/miopen_utils/network_data.hpp create mode 100644 projects/miopen/miopen_utils/include/miopen_utils/random.hpp create mode 100644 projects/miopen/miopen_utils/include/miopen_utils/rnn_util.hpp create mode 100644 projects/miopen/miopen_utils/include/miopen_utils/serialize.hpp 
create mode 100644 projects/miopen/miopen_utils/include/miopen_utils/tensor_holder.hpp create mode 100644 projects/miopen/miopen_utils/include/miopen_utils/verify.hpp diff --git a/projects/miopen/CMakeLists.txt b/projects/miopen/CMakeLists.txt index 627ddec85bbd..26bf20fd0690 100644 --- a/projects/miopen/CMakeLists.txt +++ b/projects/miopen/CMakeLists.txt @@ -897,6 +897,7 @@ endif() add_subdirectory(common_utils) add_subdirectory(addkernels) add_subdirectory(src) +add_subdirectory(miopen_utils) if(MIOPEN_BUILD_DRIVER) add_subdirectory(driver) endif() diff --git a/projects/miopen/driver/CBAInferFusion_driver.hpp b/projects/miopen/driver/CBAInferFusion_driver.hpp index 0b63f8fe5af6..8bc25e1ffc58 100644 --- a/projects/miopen/driver/CBAInferFusion_driver.hpp +++ b/projects/miopen/driver/CBAInferFusion_driver.hpp @@ -36,9 +36,9 @@ #include "util_driver.hpp" #include "conv_common.hpp" -#include "../test/verify.hpp" -#include "../test/cpu_conv.hpp" -#include "../test/cpu_bias.hpp" +#include +#include +#include #include #include diff --git a/projects/miopen/driver/CMakeLists.txt b/projects/miopen/driver/CMakeLists.txt index 693a3d47d599..835d6437b650 100644 --- a/projects/miopen/driver/CMakeLists.txt +++ b/projects/miopen/driver/CMakeLists.txt @@ -74,7 +74,7 @@ endif() add_dependencies(MIOpenDriver generate_kernels) target_include_directories(MIOpenDriver PRIVATE ../src/kernels) # MIOpen_with_plugins ensures CK plugin .so's are built alongside MIOpenDriver -target_link_libraries(MIOpenDriver PRIVATE MIOpen_with_plugins Threads::Threads roc::rocrand nlohmann_json::nlohmann_json miopen_common_utils) +target_link_libraries(MIOpenDriver PRIVATE MIOpen_with_plugins Threads::Threads roc::rocrand nlohmann_json::nlohmann_json miopen_common_utils miopen_utils) if(NOT MIOPEN_EMBED_DB STREQUAL "") target_link_libraries(MIOpenDriver PRIVATE $ ) endif() diff --git a/projects/miopen/driver/adam_driver.hpp b/projects/miopen/driver/adam_driver.hpp index f0c0258c8241..6c1984c44e87 100644 
--- a/projects/miopen/driver/adam_driver.hpp +++ b/projects/miopen/driver/adam_driver.hpp @@ -32,7 +32,7 @@ #include "tensor_driver.hpp" #include "timer.hpp" -#include "../test/verify.hpp" +#include #include #include diff --git a/projects/miopen/driver/addlayernorm_driver.hpp b/projects/miopen/driver/addlayernorm_driver.hpp index effdc90c6127..a1bac6125dfc 100644 --- a/projects/miopen/driver/addlayernorm_driver.hpp +++ b/projects/miopen/driver/addlayernorm_driver.hpp @@ -26,8 +26,8 @@ #ifndef GUARD_MIOPEN_ADDLAYERNORM_DRIVER_HPP #define GUARD_MIOPEN_ADDLAYERNORM_DRIVER_HPP -#include <../test/tensor_holder.hpp> -#include <../test/verify.hpp> +#include +#include #include "InputFlags.hpp" #include "driver.hpp" #include "random.hpp" diff --git a/projects/miopen/driver/bn_driver.hpp b/projects/miopen/driver/bn_driver.hpp index 29cdfd970356..82802f8bd965 100644 --- a/projects/miopen/driver/bn_driver.hpp +++ b/projects/miopen/driver/bn_driver.hpp @@ -35,9 +35,9 @@ #include "util_driver.hpp" #include "rocrand_wrapper.hpp" -#include "../test/verify.hpp" -#include "../test/random.hpp" -#include "../test/fusionHost.hpp" +#include +#include +#include #include #include diff --git a/projects/miopen/driver/cat_driver.hpp b/projects/miopen/driver/cat_driver.hpp index f9a675440c15..a4e6804f9aad 100644 --- a/projects/miopen/driver/cat_driver.hpp +++ b/projects/miopen/driver/cat_driver.hpp @@ -18,8 +18,8 @@ #include #include #include -#include <../test/tensor_holder.hpp> -#include <../test/verify.hpp> +#include +#include #include #ifndef MLO_CATHOST_H_ diff --git a/projects/miopen/driver/conv_driver.hpp b/projects/miopen/driver/conv_driver.hpp index fcdbdbbd2ea6..77010d71e87a 100644 --- a/projects/miopen/driver/conv_driver.hpp +++ b/projects/miopen/driver/conv_driver.hpp @@ -28,10 +28,10 @@ #include #include -#include <../test/cpu_bias.hpp> -#include <../test/cpu_conv.hpp> -#include <../test/tensor_holder.hpp> -#include <../test/verify.hpp> +#include +#include +#include +#include 
#include #include diff --git a/projects/miopen/driver/conv_verify.hpp b/projects/miopen/driver/conv_verify.hpp index ae315843f01e..31d611bce134 100644 --- a/projects/miopen/driver/conv_verify.hpp +++ b/projects/miopen/driver/conv_verify.hpp @@ -27,7 +27,7 @@ #define GUARD_MIOPEN_CONV_VERIFY_HPP #include -#include "../test/gemm.hpp" +#include template diff --git a/projects/miopen/driver/ctc_driver.hpp b/projects/miopen/driver/ctc_driver.hpp index 2b8e64a8f79a..85aecb3264d3 100644 --- a/projects/miopen/driver/ctc_driver.hpp +++ b/projects/miopen/driver/ctc_driver.hpp @@ -35,7 +35,7 @@ #include -#include <../test/verify.hpp> +#include #include #include diff --git a/projects/miopen/driver/driver.hpp b/projects/miopen/driver/driver.hpp index 5bb698554566..2ebbcc2a4000 100644 --- a/projects/miopen/driver/driver.hpp +++ b/projects/miopen/driver/driver.hpp @@ -39,7 +39,7 @@ #include #include #include -#include <../test/tensor_holder.hpp> +#include #include "util_driver.hpp" #include "rocrand_wrapper.hpp" using half = half_float::half; diff --git a/projects/miopen/driver/dropout_driver.hpp b/projects/miopen/driver/dropout_driver.hpp index 84d942155a08..0016340fd60e 100644 --- a/projects/miopen/driver/dropout_driver.hpp +++ b/projects/miopen/driver/dropout_driver.hpp @@ -34,7 +34,7 @@ #include "util_driver.hpp" #include "util_file.hpp" -#include <../test/verify.hpp> +#include #include #include diff --git a/projects/miopen/driver/gemm_driver.hpp b/projects/miopen/driver/gemm_driver.hpp index d89a09a56644..8383b01ec22f 100644 --- a/projects/miopen/driver/gemm_driver.hpp +++ b/projects/miopen/driver/gemm_driver.hpp @@ -34,7 +34,7 @@ #include "random.hpp" #include "util_driver.hpp" -#include <../test/verify.hpp> +#include #include #include diff --git a/projects/miopen/driver/getitem_driver.hpp b/projects/miopen/driver/getitem_driver.hpp index 52a5bc262f82..55b0dfcd296c 100644 --- a/projects/miopen/driver/getitem_driver.hpp +++ b/projects/miopen/driver/getitem_driver.hpp @@ -40,8 
+40,8 @@ #include #include #include -#include <../test/tensor_holder.hpp> -#include <../test/verify.hpp> +#include +#include template int32_t mloGetitemBackwardRunHost(miopenTensorDescriptor_t dyDesc, diff --git a/projects/miopen/driver/glu_driver.hpp b/projects/miopen/driver/glu_driver.hpp index 38deb2d69e78..63bf7188db4d 100644 --- a/projects/miopen/driver/glu_driver.hpp +++ b/projects/miopen/driver/glu_driver.hpp @@ -38,7 +38,7 @@ #include #include -#include <../test/verify.hpp> +#include #include #include diff --git a/projects/miopen/driver/groupnorm_driver.hpp b/projects/miopen/driver/groupnorm_driver.hpp index 3773654c842d..97553dd3c13e 100644 --- a/projects/miopen/driver/groupnorm_driver.hpp +++ b/projects/miopen/driver/groupnorm_driver.hpp @@ -32,7 +32,7 @@ #include "mloGroupNormHost.hpp" #include "tensor_driver.hpp" #include "timer.hpp" -#include <../test/verify.hpp> +#include #include #include #include @@ -40,7 +40,7 @@ #include #include #include -#include <../test/tensor_holder.hpp> +#include #include "random.hpp" template diff --git a/projects/miopen/driver/gru_verify_gemm.hpp b/projects/miopen/driver/gru_verify_gemm.hpp index e07d6eab0bff..237d311b1c29 100644 --- a/projects/miopen/driver/gru_verify_gemm.hpp +++ b/projects/miopen/driver/gru_verify_gemm.hpp @@ -28,7 +28,7 @@ #include "dropout_gpu_emulator.hpp" -#include <../test/rnn_util.hpp> +#include #include #include diff --git a/projects/miopen/driver/kthvalue_driver.hpp b/projects/miopen/driver/kthvalue_driver.hpp index 75f7e5b535b2..8cbfa302bf14 100644 --- a/projects/miopen/driver/kthvalue_driver.hpp +++ b/projects/miopen/driver/kthvalue_driver.hpp @@ -30,8 +30,8 @@ #include "timer.hpp" #include "random.hpp" -#include <../test/tensor_holder.hpp> -#include <../test/verify.hpp> +#include +#include #include #include diff --git a/projects/miopen/driver/layernorm_driver.hpp b/projects/miopen/driver/layernorm_driver.hpp index 6f6662f202f6..042e8a7164ea 100644 --- 
a/projects/miopen/driver/layernorm_driver.hpp +++ b/projects/miopen/driver/layernorm_driver.hpp @@ -26,9 +26,9 @@ #ifndef GUARD_MIOPEN_LAYERNORM_DRIVER_HPP #define GUARD_MIOPEN_LAYERNORM_DRIVER_HPP -#include <../test/tensor_holder.hpp> -#include <../test/verify.hpp> -#include <../test/cpu_layernorm.hpp> +#include +#include +#include #include "InputFlags.hpp" #include "driver.hpp" #include "miopen/miopen.h" diff --git a/projects/miopen/driver/lrn_driver.hpp b/projects/miopen/driver/lrn_driver.hpp index c1645621acd4..2f164aad38b1 100644 --- a/projects/miopen/driver/lrn_driver.hpp +++ b/projects/miopen/driver/lrn_driver.hpp @@ -12,7 +12,7 @@ #include "timer.hpp" #include "util_driver.hpp" -#include "../test/verify.hpp" +#include #include #include diff --git a/projects/miopen/driver/lstm_verify_gemm.hpp b/projects/miopen/driver/lstm_verify_gemm.hpp index fb98d5616ad5..a761779738f4 100644 --- a/projects/miopen/driver/lstm_verify_gemm.hpp +++ b/projects/miopen/driver/lstm_verify_gemm.hpp @@ -28,7 +28,7 @@ #include "dropout_gpu_emulator.hpp" -#include <../test/rnn_util.hpp> +#include #include #include diff --git a/projects/miopen/driver/miopen_Reduction.hpp b/projects/miopen/driver/miopen_Reduction.hpp index 3aee4e375c97..0fc05603bf2e 100644 --- a/projects/miopen/driver/miopen_Reduction.hpp +++ b/projects/miopen/driver/miopen_Reduction.hpp @@ -31,7 +31,7 @@ #include #include -#include "../test/cpu_reduce_util.hpp" +#include #include "tensor_driver.hpp" diff --git a/projects/miopen/driver/multimarginloss_driver.hpp b/projects/miopen/driver/multimarginloss_driver.hpp index dab040ef3ef3..5d2a60db4507 100644 --- a/projects/miopen/driver/multimarginloss_driver.hpp +++ b/projects/miopen/driver/multimarginloss_driver.hpp @@ -36,8 +36,8 @@ #include #include #include -#include <../test/tensor_holder.hpp> -#include <../test/verify.hpp> +#include +#include #include template diff --git a/projects/miopen/driver/prelu_driver.hpp b/projects/miopen/driver/prelu_driver.hpp index 
761f97cc64eb..cab2eb811885 100644 --- a/projects/miopen/driver/prelu_driver.hpp +++ b/projects/miopen/driver/prelu_driver.hpp @@ -31,7 +31,7 @@ #include "tensor_driver.hpp" #include "timer.hpp" -#include <../test/verify.hpp> +#include #include diff --git a/projects/miopen/driver/reduce_driver.hpp b/projects/miopen/driver/reduce_driver.hpp index ab1c50e806f1..6300fa32a690 100644 --- a/projects/miopen/driver/reduce_driver.hpp +++ b/projects/miopen/driver/reduce_driver.hpp @@ -35,7 +35,7 @@ #include "util_driver.hpp" #include "util_file.hpp" -#include "../test/verify.hpp" +#include #include #include diff --git a/projects/miopen/driver/reducecalculation_driver.hpp b/projects/miopen/driver/reducecalculation_driver.hpp index 200196950997..738fb6032f3c 100644 --- a/projects/miopen/driver/reducecalculation_driver.hpp +++ b/projects/miopen/driver/reducecalculation_driver.hpp @@ -40,8 +40,8 @@ #include #include #include -#include <../test/tensor_holder.hpp> -#include <../test/verify.hpp> +#include +#include #include "../src/kernels/MIOpenReduceCalculation.hpp" #ifndef MLO_REDUCE_CALCULATIONMHOST_H_ diff --git a/projects/miopen/driver/reduceextreme_driver.hpp b/projects/miopen/driver/reduceextreme_driver.hpp index a06f5288a164..b2caf5dda398 100644 --- a/projects/miopen/driver/reduceextreme_driver.hpp +++ b/projects/miopen/driver/reduceextreme_driver.hpp @@ -39,8 +39,8 @@ #include #include #include -#include <../test/tensor_holder.hpp> -#include <../test/verify.hpp> +#include +#include #include "../src/kernels/MIOpenReduceExtreme.hpp" template diff --git a/projects/miopen/driver/rnn_driver.hpp b/projects/miopen/driver/rnn_driver.hpp index 4cd47739f5ea..7f35be320155 100644 --- a/projects/miopen/driver/rnn_driver.hpp +++ b/projects/miopen/driver/rnn_driver.hpp @@ -36,7 +36,7 @@ #include "util_driver.hpp" #include "util_file.hpp" -#include <../test/verify.hpp> +#include #include #include diff --git a/projects/miopen/driver/rnn_seq_driver.hpp 
b/projects/miopen/driver/rnn_seq_driver.hpp index 1ac9b23c0b4c..7babcfd00273 100644 --- a/projects/miopen/driver/rnn_seq_driver.hpp +++ b/projects/miopen/driver/rnn_seq_driver.hpp @@ -36,7 +36,7 @@ #include "util_driver.hpp" #include "util_file.hpp" -#include <../test/verify.hpp> +#include #include #include diff --git a/projects/miopen/driver/rnn_verify_gemm.hpp b/projects/miopen/driver/rnn_verify_gemm.hpp index b1fa42c3503b..04b73111513d 100644 --- a/projects/miopen/driver/rnn_verify_gemm.hpp +++ b/projects/miopen/driver/rnn_verify_gemm.hpp @@ -28,7 +28,7 @@ #include "dropout_gpu_emulator.hpp" -#include <../test/rnn_util.hpp> +#include #include #include diff --git a/projects/miopen/driver/rope_driver.hpp b/projects/miopen/driver/rope_driver.hpp index bbad2370bf4e..27f0a03126ac 100644 --- a/projects/miopen/driver/rope_driver.hpp +++ b/projects/miopen/driver/rope_driver.hpp @@ -39,8 +39,8 @@ #include #include #include -#include <../test/tensor_holder.hpp> -#include <../test/verify.hpp> +#include +#include template int32_t mloRoPEForwardRunHost(miopenTensorDescriptor_t xDesc, diff --git a/projects/miopen/driver/softmarginloss_driver.hpp b/projects/miopen/driver/softmarginloss_driver.hpp index 3a6b095eaa0e..6589abd88db9 100644 --- a/projects/miopen/driver/softmarginloss_driver.hpp +++ b/projects/miopen/driver/softmarginloss_driver.hpp @@ -35,8 +35,8 @@ #include #include #include -#include <../test/tensor_holder.hpp> -#include <../test/verify.hpp> +#include +#include #include template diff --git a/projects/miopen/driver/softmax_driver.hpp b/projects/miopen/driver/softmax_driver.hpp index e147191b2deb..52f42fdfd5f8 100644 --- a/projects/miopen/driver/softmax_driver.hpp +++ b/projects/miopen/driver/softmax_driver.hpp @@ -11,7 +11,7 @@ #include "timer.hpp" #include "util_driver.hpp" -#include <../test/verify.hpp> +#include #include #include diff --git a/projects/miopen/driver/t5layernorm_driver.hpp b/projects/miopen/driver/t5layernorm_driver.hpp index 
c8517ad525d8..b57fe456403f 100644 --- a/projects/miopen/driver/t5layernorm_driver.hpp +++ b/projects/miopen/driver/t5layernorm_driver.hpp @@ -26,8 +26,8 @@ #ifndef GUARD_MIOPEN_T5LAYERNORM_DRIVER_HPP #define GUARD_MIOPEN_T5LAYERNORM_DRIVER_HPP -#include <../test/tensor_holder.hpp> -#include <../test/verify.hpp> +#include +#include #include "InputFlags.hpp" #include "driver.hpp" #include "random.hpp" diff --git a/projects/miopen/driver/transformers_adam_w_driver.hpp b/projects/miopen/driver/transformers_adam_w_driver.hpp index dfd82a3284c6..a1cd81f2eb53 100644 --- a/projects/miopen/driver/transformers_adam_w_driver.hpp +++ b/projects/miopen/driver/transformers_adam_w_driver.hpp @@ -32,7 +32,7 @@ #include "tensor_driver.hpp" #include "timer.hpp" -#include "../test/verify.hpp" +#include #include #include diff --git a/projects/miopen/miopen_utils/CMakeLists.txt b/projects/miopen/miopen_utils/CMakeLists.txt new file mode 100644 index 000000000000..47e61c063411 --- /dev/null +++ b/projects/miopen/miopen_utils/CMakeLists.txt @@ -0,0 +1,38 @@ +################################################################################ +# +# MIT License +# +# Copyright (c) 2025 Advanced Micro Devices, Inc. +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. 
+# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. +# +################################################################################ + +# Utility library for MIOpen test/verification code shared by MIOpenDriver and tests. +# Depends on common_utils and the MIOpen public API (miopen.h). +# Phase 1: May still use MIOpen internal headers temporarily. + +add_library(miopen_utils INTERFACE) +add_library(MIOpen::miopen_utils ALIAS miopen_utils) + +target_include_directories(miopen_utils INTERFACE + $ +) + +target_link_libraries(miopen_utils INTERFACE miopen_common_utils) diff --git a/projects/miopen/miopen_utils/include/miopen_utils/cpu_bias.hpp b/projects/miopen/miopen_utils/include/miopen_utils/cpu_bias.hpp new file mode 100644 index 000000000000..0125ca37d298 --- /dev/null +++ b/projects/miopen/miopen_utils/include/miopen_utils/cpu_bias.hpp @@ -0,0 +1,140 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2019 Advanced Micro Devices, Inc. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + *******************************************************************************/ +#ifndef GUARD_CPU_BIAS_HPP +#define GUARD_CPU_BIAS_HPP + +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +template +void cpu_bias_forward_impl(tensor& out, const tensor& bias) +{ + assert(out.desc.GetNumDims() == NSpatialDim + 2 and bias.desc.GetNumDims() == NSpatialDim + 2); + assert( + bias.desc.GetLengths()[0] == 1 && bias.desc.GetLengths()[1] == out.desc.GetLengths()[1] && + std::all_of(bias.desc.GetLengths().begin() + 2, bias.desc.GetLengths().end(), [](auto v) { + return v == 1; + })); + + out.par_for_each([&](auto out_n_id, auto out_k_id, auto... out_spatial_id_pack) { + out(out_n_id, out_k_id, out_spatial_id_pack...) 
= + double(out(out_n_id, out_k_id, out_spatial_id_pack...)) + double(bias.data[out_k_id]); + }); +} + +template +void cpu_bias_backward_data_impl(const tensor& out, tensor& bias) +{ + assert(out.desc.GetNumDims() == NSpatialDim + 2 and bias.desc.GetNumDims() == NSpatialDim + 2); + assert( + bias.desc.GetLengths()[0] == 1 && bias.desc.GetLengths()[1] == out.desc.GetLengths()[0] && + std::all_of(bias.desc.GetLengths().begin() + 2, bias.desc.GetLengths().end(), [](auto v) { + return v == 1; + })); + + std::size_t out_n_len = out.desc.GetLengths()[0]; + std::size_t out_k_len = out.desc.GetLengths()[1]; + + std::array out_spatial_len{}; + std::copy_n(out.desc.GetLengths().begin() + 2, NSpatialDim, out_spatial_len.begin()); + + miopen::par_ford(out_k_len)([&](auto out_k_id) { + auto ford_out_n_spatial = + miopen::unpacker(miopen::prepender(miopen::ford, out_n_len))(out_spatial_len); + + double acc = 0; + ford_out_n_spatial([&](auto out_n_id, auto... out_spatial_id_pack) { + acc += double(out(out_n_id, out_k_id, out_spatial_id_pack...)); + }); + + bias.data[out_k_id] = acc; + }); +} + +template +void cpu_bias_forward(tensor& out, const tensor& bias) +{ + switch(out.desc.GetNumDims()) + { + case 3: { + cpu_bias_forward_impl<1>(out, bias); + break; + } + case 4: { + cpu_bias_forward_impl<2>(out, bias); + break; + } + case 5: { + cpu_bias_forward_impl<3>(out, bias); + break; + } + case 6: { + cpu_bias_forward_impl<4>(out, bias); + break; + } + default: { + MIOPEN_THROW("not belong to any case"); + } + } +} + +template +void cpu_bias_backward_data(const tensor& out, tensor& bias) +{ + switch(out.desc.GetNumDims()) + { + case 3: { + cpu_bias_backward_data_impl<1>(out, bias); + break; + } + case 4: { + cpu_bias_backward_data_impl<2>(out, bias); + break; + } + case 5: { + cpu_bias_backward_data_impl<3>(out, bias); + break; + } + case 6: { + cpu_bias_backward_data_impl<4>(out, bias); + break; + } + default: { + MIOPEN_THROW("not belong to any case"); + } + } +} +#endif diff --git 
a/projects/miopen/miopen_utils/include/miopen_utils/cpu_conv.hpp b/projects/miopen/miopen_utils/include/miopen_utils/cpu_conv.hpp new file mode 100644 index 000000000000..2ef2c5b31236 --- /dev/null +++ b/projects/miopen/miopen_utils/include/miopen_utils/cpu_conv.hpp @@ -0,0 +1,514 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2019 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + *******************************************************************************/ +#ifndef GUARD_CPU_CONV_HPP +#define GUARD_CPU_CONV_HPP + +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +template +static constexpr auto make_array(T x, Ts... 
xs) +{ + return std::array{{x, xs...}}; +} + +template +struct PassThru +{ + T operator()(T t) { return t; } +}; + +template +struct cpu_convolution_acc_type +{ + using type = double; // default using double as accumulator +}; + +template <> +struct cpu_convolution_acc_type +{ + using type = int32_t; +}; + +template <> +struct cpu_convolution_acc_type +{ + using type = double; +}; + +template +void cpu_convolution_forward_impl(const tensor& in, + const tensor& wei, + tensor& out, + const Range& pads, + const Range& strides, + const Range& dilations, + std::size_t group_count, + FI fi = {}, + FW fw = {}) +{ + static_assert(ConvDim > 0, "wrong! convolution dim should be larger than 0"); + assert(in.desc.GetNumDims() == ConvDim + 2 and wei.desc.GetNumDims() == ConvDim + 2 and + out.desc.GetNumDims() == ConvDim + 2 and pads.size() == ConvDim and + strides.size() == ConvDim and dilations.size() == ConvDim); + std::size_t out_n_len = out.desc.GetLengths()[0]; + + std::size_t wei_k_len = wei.desc.GetLengths()[0]; + std::size_t wei_c_len = wei.desc.GetLengths()[1]; + + std::size_t vector_len = in.desc.GetVectorLength(); + + std::array in_spatial_len{}; + std::array wei_spatial_len{}; + std::array out_spatial_len{}; + + std::copy_n(in.desc.GetLengths().begin() + 2, ConvDim, in_spatial_len.begin()); + std::copy_n(wei.desc.GetLengths().begin() + 2, ConvDim, wei_spatial_len.begin()); + std::copy_n(out.desc.GetLengths().begin() + 2, ConvDim, out_spatial_len.begin()); + + if(wei.desc.GetLayout_str() == "CHWNc") + { + wei_c_len = wei.desc.GetLengths()[0]; + std::copy_n(wei.desc.GetLengths().begin() + 1, ConvDim, wei_spatial_len.begin()); + wei_k_len = wei.desc.GetLengths()[3]; + } + + std::size_t wei_k_len_per_group = wei_k_len / group_count; + + // f(x0, x1, xs...) + // f1(xs...) = f(x0, x1, xs...) + // f2(xs_array) = f1(xs...) 
+ auto par_ford_out_nk_spatial = miopen::unpacker( + miopen::prepender(miopen::par_ford, out_n_len, wei_k_len))(out_spatial_len); + + par_ford_out_nk_spatial([&](std::size_t out_n_id, + std::size_t out_k_id, + auto... out_spatial_id_pack) { + auto out_spatial_id = make_array(out_spatial_id_pack...); + + std::size_t group_id = out_k_id / wei_k_len_per_group; + Tacc acc = 0; + + miopen::ford(wei_c_len)([&](std::size_t wei_c_id) { + std::size_t in_c_id = group_id * wei_c_len + wei_c_id; + + auto ford_wei_spatial = miopen::unpacker(miopen::ford)(wei_spatial_len); + + ford_wei_spatial([&](auto... wei_spatial_id_pack) { + auto wei_spatial_id = make_array(wei_spatial_id_pack...); + + std::array in_spatial_id{}; + + for(std::size_t i = 0; i < ConvDim; ++i) + { + in_spatial_id[i] = + out_spatial_id[i] * strides[i] + wei_spatial_id[i] * dilations[i] - pads[i]; + } + bool out_of_bound = false; + for(std::size_t i = 0; i < ConvDim; ++i) + { + out_of_bound = out_of_bound or + (in_spatial_id[i] < 0 or in_spatial_id[i] >= in_spatial_len[i]); + } + if(!out_of_bound) + { + if(vector_len > 1) + { + std::array in_id{}; + in_id[1] = out_n_id; + in_id[2] = in_c_id; + std::copy_n(in_spatial_id.begin(), ConvDim, in_id.begin() + 3); + for(std::size_t i = 0; i < vector_len; i++) + { + in_id[0] = i; + acc += Tacc(in(in_id)) * + Tacc(wei(i, out_k_id, wei_c_id, wei_spatial_id_pack...)); + } + } + else + { + std::array in_id{}; + in_id[0] = out_n_id; + in_id[1] = in_c_id; + std::copy_n(in_spatial_id.begin(), ConvDim, in_id.begin() + 2); + Tacc tmp1 = static_cast(fi(in(in_id))); + Tacc tmp2 = + static_cast(fw(wei(out_k_id, wei_c_id, wei_spatial_id_pack...))); + acc += tmp1 * tmp2; + } + } + }); + }); + if(vector_len > 1) + { + out(out_k_id % vector_len, out_n_id, out_k_id / vector_len, out_spatial_id_pack...) = + static_cast(acc); + } + else + { + out(out_n_id, out_k_id, out_spatial_id_pack...) 
= static_cast(acc); + } + }); +} + +template +void cpu_convolution_backward_data_impl(tensor& in, + const tensor& wei, + const tensor& out, + const Range& pads, + const Range& strides, + const Range& dilations, + std::size_t group_count, + FW fw = {}, + FO fo = {}) +{ + static_assert(ConvDim > 0, "wrong! convolution dim should be larger than 0"); + assert(in.desc.GetNumDims() == ConvDim + 2 and wei.desc.GetNumDims() == ConvDim + 2 and + out.desc.GetNumDims() == ConvDim + 2 and pads.size() == ConvDim and + strides.size() == ConvDim and dilations.size() == ConvDim); + + std::size_t in_n_len = in.desc.GetLengths()[0]; + std::size_t in_c_len = in.desc.GetLengths()[1]; + + std::size_t wei_k_len = wei.desc.GetLengths()[0]; + std::size_t wei_c_len = wei.desc.GetLengths()[1]; + + std::size_t wei_k_len_per_group = wei_k_len / group_count; + + std::array in_spatial_len{}; + std::array wei_spatial_len{}; + std::array out_spatial_len{}; + + std::copy_n(in.desc.GetLengths().begin() + 2, ConvDim, in_spatial_len.begin()); + std::copy_n(wei.desc.GetLengths().begin() + 2, ConvDim, wei_spatial_len.begin()); + std::copy_n(out.desc.GetLengths().begin() + 2, ConvDim, out_spatial_len.begin()); + + auto par_ford_in_nc_spatial = + miopen::unpacker(miopen::prepender(miopen::par_ford, in_n_len, in_c_len))(in_spatial_len); + + par_ford_in_nc_spatial( + [&](std::size_t in_n_id, std::size_t in_c_id, auto... in_spatial_id_pack) { + auto in_spatial_id = make_array(in_spatial_id_pack...); + + std::size_t group_id = in_c_id / wei_c_len; + + Tacc acc = 0; + + miopen::ford(wei_k_len_per_group)([&](std::size_t wei_k_id_inside_group) { + auto ford_wei_spatial = miopen::unpacker(miopen::ford)(wei_spatial_len); + + ford_wei_spatial([&](auto... 
wei_spatial_id_pack) { + auto wei_spatial_id = make_array(wei_spatial_id_pack...); + + std::array out_spatial_id_{}; + std::array out_spatial_id{}; + + for(std::size_t i = 0; i < ConvDim; ++i) + { + out_spatial_id_[i] = + pads[i] + in_spatial_id[i] - wei_spatial_id[i] * dilations[i]; + out_spatial_id[i] = out_spatial_id_[i] / strides[i]; + } + + bool use = true; + for(std::size_t i = 0; i < ConvDim; ++i) + { + use &= out_spatial_id_[i] % strides[i] == 0 and out_spatial_id[i] >= 0 and + out_spatial_id[i] < out_spatial_len[i]; + } + + if(use) + { + std::size_t out_k_id = + group_id * wei_k_len_per_group + wei_k_id_inside_group; + std::size_t wei_c_id = in_c_id % wei_c_len; + + std::array out_id{}; + out_id[0] = in_n_id; + out_id[1] = out_k_id; + std::copy_n(out_spatial_id.begin(), ConvDim, out_id.begin() + 2); + Tacc tmp1 = fo(out(out_id)); + Tacc tmp2 = fw(wei(out_k_id, wei_c_id, wei_spatial_id_pack...)); + acc += tmp1 * tmp2; + } + }); + }); + // TODO: Why do we need a no-lint here ? + in(in_n_id, in_c_id, in_spatial_id_pack...) = static_cast(acc); // NOLINT + }); +} + +template +void cpu_convolution_backward_weight_impl(const tensor& in, + tensor& wei, + const tensor& out, + const Range& pads, + const Range& strides, + const Range& dilations, + std::size_t group_count, + FI fi, + FO fo) +{ + static_assert(ConvDim > 0, "wrong! 
convolution dim should be larger than 0"); + assert(in.desc.GetNumDims() == ConvDim + 2 and wei.desc.GetNumDims() == ConvDim + 2 and + out.desc.GetNumDims() == ConvDim + 2 and pads.size() == ConvDim and + strides.size() == ConvDim and dilations.size() == ConvDim); + + std::size_t out_n_len = out.desc.GetLengths()[0]; + + std::size_t wei_k_len = wei.desc.GetLengths()[0]; + std::size_t wei_c_len = wei.desc.GetLengths()[1]; + + std::size_t wei_k_len_per_group = wei_k_len / group_count; + + std::array in_spatial_len{}; + std::array wei_spatial_len{}; + std::array out_spatial_len{}; + + std::copy_n(in.desc.GetLengths().begin() + 2, ConvDim, in_spatial_len.begin()); + std::copy_n(wei.desc.GetLengths().begin() + 2, ConvDim, wei_spatial_len.begin()); + std::copy_n(out.desc.GetLengths().begin() + 2, ConvDim, out_spatial_len.begin()); + + auto par_ford_wei_kc_spatial = miopen::unpacker( + miopen::prepender(miopen::par_ford, wei_k_len, wei_c_len))(wei_spatial_len); + + par_ford_wei_kc_spatial( + [&](std::size_t wei_k_id, std::size_t wei_c_id, auto... wei_spatial_id_pack) { + auto wei_spatial_id = make_array(wei_spatial_id_pack...); + + std::size_t group_id = wei_k_id / wei_k_len_per_group; + std::size_t in_c_id = group_id * wei_c_len + wei_c_id; + + Tacc acc = 0; + + miopen::ford(out_n_len)([&](std::size_t out_n_id) { + auto ford_out_spatial = miopen::unpacker(miopen::ford)(out_spatial_len); + + ford_out_spatial([&](auto... 
out_spatial_id_pack) { + auto out_spatial_id = make_array(out_spatial_id_pack...); + + std::array in_spatial_id{}; + + for(std::size_t i = 0; i < ConvDim; ++i) + { + in_spatial_id[i] = out_spatial_id[i] * strides[i] + + wei_spatial_id[i] * dilations[i] - pads[i]; + } + + bool out_of_bound = false; + for(std::size_t i = 0; i < ConvDim; ++i) + { + out_of_bound = out_of_bound or (in_spatial_id[i] < 0 or + in_spatial_id[i] >= in_spatial_len[i]); + } + + if(!out_of_bound) + { + std::array in_id{}; + in_id[0] = out_n_id; + in_id[1] = in_c_id; + std::copy_n(in_spatial_id.begin(), ConvDim, in_id.begin() + 2); + Tacc tmp1 = fi(in(in_id)); + Tacc tmp2 = fo(out(out_n_id, wei_k_id, out_spatial_id_pack...)); + acc += tmp1 * tmp2; + } + }); + + wei(wei_k_id, wei_c_id, wei_spatial_id_pack...) = static_cast(acc); + }); + }); +} + +template , + typename FW = PassThru> +void cpu_convolution_forward(std::size_t spatial_dim, + const tensor& in, + const tensor& wei, + tensor& out, + const Range& pads, + const Range& strides, + const Range& dilations, + std::size_t group_count, + FI fi = {}, + FW fw = {}) +{ + switch(spatial_dim) + { + case 1: { + cpu_convolution_forward_impl<1, Tacc>( + in, wei, out, pads, strides, dilations, group_count, fi, fw); + break; + } + case 2: { + cpu_convolution_forward_impl<2, Tacc>( + in, wei, out, pads, strides, dilations, group_count, fi, fw); + break; + } + case 3: { + cpu_convolution_forward_impl<3, Tacc>( + in, wei, out, pads, strides, dilations, group_count, fi, fw); + break; + } + case 4: { + cpu_convolution_forward_impl<4, Tacc>( + in, wei, out, pads, strides, dilations, group_count, fi, fw); + break; + } + default: { + MIOPEN_THROW("not belong to any case"); + } + } +} + +template , + typename FO = PassThru> +void cpu_convolution_backward_data(std::size_t spatial_dim, + tensor& in, + const tensor& wei, + const tensor& out, + const Range& pads, + const Range& strides, + const Range& dilations, + std::size_t group_count, + FW fw = {}, + FO fo = {}) 
+{ + switch(spatial_dim) + { + case 1: { + cpu_convolution_backward_data_impl<1, Tacc>( + in, wei, out, pads, strides, dilations, group_count, fw, fo); + break; + } + case 2: { + cpu_convolution_backward_data_impl<2, Tacc>( + in, wei, out, pads, strides, dilations, group_count, fw, fo); + break; + } + case 3: { + cpu_convolution_backward_data_impl<3, Tacc>( + in, wei, out, pads, strides, dilations, group_count, fw, fo); + break; + } + case 4: { + cpu_convolution_backward_data_impl<4, Tacc>( + in, wei, out, pads, strides, dilations, group_count, fw, fo); + break; + } + default: { + MIOPEN_THROW("not belong to any case"); + } + } +} + +template , + typename FO = PassThru> +void cpu_convolution_backward_weight(std::size_t spatial_dim, + const tensor& in, + tensor& wei, + const tensor& out, + const Range& pads, + const Range& strides, + const Range& dilations, + std::size_t group_count, + FI fi = {}, + FO fo = {}) +{ + switch(spatial_dim) + { + case 1: { + cpu_convolution_backward_weight_impl<1, Tacc>( + in, wei, out, pads, strides, dilations, group_count, fi, fo); + break; + } + case 2: { + cpu_convolution_backward_weight_impl<2, Tacc>( + in, wei, out, pads, strides, dilations, group_count, fi, fo); + break; + } + case 3: { + cpu_convolution_backward_weight_impl<3, Tacc>( + in, wei, out, pads, strides, dilations, group_count, fi, fo); + break; + } + case 4: { + cpu_convolution_backward_weight_impl<4, Tacc>( + in, wei, out, pads, strides, dilations, group_count, fi, fo); + break; + } + default: { + MIOPEN_THROW("not belong to any case"); + } + } +} +#endif diff --git a/projects/miopen/miopen_utils/include/miopen_utils/cpu_layernorm.hpp b/projects/miopen/miopen_utils/include/miopen_utils/cpu_layernorm.hpp new file mode 100644 index 000000000000..0a6ab5556865 --- /dev/null +++ b/projects/miopen/miopen_utils/include/miopen_utils/cpu_layernorm.hpp @@ -0,0 +1,216 @@ +// Copyright © Advanced Micro Devices, Inc., or its affiliates. 
+// SPDX-License-Identifier: MIT +#ifndef GUARD_CPU_CONV_HPP +#define GUARD_CPU_CONV_HPP + +#include + +template +void cpu_layernorm_forward(tensor input, + tensor weight, + tensor bias, + tensor& ref_output, + tensor& ref_mean, + tensor& ref_rstd, + float eps, + int32_t dim, + miopenNormMode_t mode, + bool use_multithread = false) +{ + auto layout = input.desc.GetLayoutEnum(); + size_t stride = 1; + if(dim > 1 && layout.has_value() && + (layout.value() == miopenTensorNHWC || layout.value() == miopenTensorNDHWC)) + { + stride = input.desc.GetLengths()[1]; // stride = C + } + + auto dims = input.desc.GetLengths(); + size_t outer_size = 1; + size_t inner_size = 1; + for(size_t i = 0; i < dims.size(); ++i) + { + if(i < dim) + { + if(!(stride > 1 && i == 1)) + { + outer_size *= dims[i]; + } + } + else + { + inner_size *= dims[i]; + } + } + + size_t min_grain = use_multithread ? 8 : outer_size; + miopen::par_for(outer_size, min_grain, [&](int32_t o) { + miopen::ford(stride)([&](int32_t s) { + double mean_v = 0.0; + double var_v = 0.0; + + miopen::ford(inner_size)([&](int32_t i) { + double tmp = static_cast(input[o * inner_size * stride + i * stride + s]); + mean_v += tmp; + var_v += tmp * tmp; + }); + + mean_v = mean_v / inner_size; + var_v = var_v / inner_size - mean_v * mean_v; + double rstd_v = 1.0 / sqrt(var_v + eps); + + ref_mean[o * stride + s] = static_cast(mean_v); + ref_rstd[o * stride + s] = static_cast(rstd_v); + + miopen::ford(inner_size)([&](int32_t i) { + double weight_v = + (mode == MIOPEN_ELEMENTWISE_AFFINE) ? 1.0 : static_cast(weight[i]); + double bias_v = + (mode == MIOPEN_ELEMENTWISE_AFFINE) ? 
0.0 : static_cast(bias[i]); + + ref_output[o * inner_size * stride + i * stride + s] = static_cast( + (static_cast(input[o * inner_size * stride + i * stride + s]) - + mean_v) * + rstd_v * weight_v + + bias_v); + }); + }); + }); +} + +template +void cpu_layernorm_backward(tensor dy, + tensor x, + tensor weight, + tensor mean, + tensor rstd, + tensor& ref_dx, + int32_t dim, + miopenNormMode_t mode, + bool use_multithread = false) +{ + auto layout = dy.desc.GetLayoutEnum(); + size_t stride = 1; + if(dim > 1 && (layout == miopenTensorNHWC || layout == miopenTensorNDHWC)) + { + stride = dy.desc.GetLengths()[1]; // stride = C + } + + auto dims = dy.desc.GetLengths(); + size_t outer_size = 1; + size_t inner_size = 1; + for(size_t i = 0; i < dims.size(); ++i) + { + if(i < dim) + { + if(!(stride > 1 && i == 1)) + { + outer_size *= dims[i]; + } + } + else + { + inner_size *= dims[i]; + } + } + + size_t min_grain = use_multithread ? 8 : outer_size; + miopen::par_for(outer_size, min_grain, [&](int32_t o) { + miopen::ford(stride)([&](int32_t s) { + double sum_dy_weight = 0.0; + double sum_dy_weight_x = 0.0; + + miopen::ford(inner_size)([&](int32_t i) { + double pweight = + (mode == MIOPEN_ELEMENTWISE_AFFINE) ? 1.0 : static_cast(weight[i]); + double pdy = (dy.GetSize() != 0) + ? static_cast(dy[o * inner_size * stride + i * stride + s]) + : 0.0; + double px = static_cast(x[o * inner_size * stride + i * stride + s]); + + sum_dy_weight += pdy * pweight; + sum_dy_weight_x += pdy * px * pweight; + }); + + double scale = 1.0 / static_cast(inner_size); + double prstd = static_cast(rstd[o * stride + s]); + double pmean = static_cast(mean[o * stride + s]); + double a = prstd * prstd * prstd * scale * (sum_dy_weight_x - sum_dy_weight * pmean); + double b = prstd * sum_dy_weight * scale - a * pmean; + + miopen::ford(inner_size)([&](int32_t i) { + double pweight = + (mode == MIOPEN_ELEMENTWISE_AFFINE) ? 1.0 : static_cast(weight[i]); + double pdy = (dy.GetSize() != 0) + ? 
static_cast(dy[o * inner_size * stride + i * stride + s]) + : 0.0; + double val = prstd * pdy * pweight - + a * static_cast(x[o * inner_size * stride + i * stride + s]) - + b; + + ref_dx[o * inner_size * stride + i * stride + s] = static_cast(val); + }); + }); + }); +} + +template +void cpu_layernorm_backward_weight_bias(tensor dy, + tensor x, + tensor mean, + tensor rstd, + tensor& ref_dw, + tensor& ref_db, + int32_t dim, + bool use_multithread = false) +{ + auto layout = dy.desc.GetLayoutEnum(); + size_t stride = 1; + if(dim > 1 && (layout == miopenTensorNHWC || layout == miopenTensorNDHWC)) + { + stride = dy.desc.GetLengths()[1]; // stride = C + } + + auto dims = dy.desc.GetLengths(); + size_t outer_size = 1; + size_t inner_size = 1; + for(size_t i = 0; i < dims.size(); ++i) + { + if(i < dim) + { + if(!(stride > 1 && i == 1)) + { + outer_size *= dims[i]; + } + } + else + { + inner_size *= dims[i]; + } + } + + size_t min_grain = use_multithread ? 8 : inner_size; + miopen::par_for(inner_size, min_grain, [&](int32_t i) { + double sum_dw = 0.0; + double sum_db = 0.0; + + miopen::ford(stride)([&](int32_t s) { + miopen::ford(outer_size)([&](int32_t o) { + double prstd = static_cast(rstd[o * stride + s]); + double pmean = static_cast(mean[o * stride + s]); + double pdy = (dy.GetSize() != 0) + ? 
static_cast(dy[o * inner_size * stride + i * stride + s]) + : 0; + double px = static_cast(x[o * inner_size * stride + i * stride + s]); + + sum_dw += pdy * (px - pmean) * prstd; + sum_db += pdy; + }); + }); + + ref_dw[i] = sum_dw; + ref_db[i] = sum_db; + }); +} + +#endif diff --git a/projects/miopen/miopen_utils/include/miopen_utils/cpu_reduce_util.hpp b/projects/miopen/miopen_utils/include/miopen_utils/cpu_reduce_util.hpp new file mode 100644 index 000000000000..e5f7d50f9d0b --- /dev/null +++ b/projects/miopen/miopen_utils/include/miopen_utils/cpu_reduce_util.hpp @@ -0,0 +1,649 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2020 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + *******************************************************************************/ +#ifndef GUARD_CPU_REDUCE_UTIL_HPP +#define GUARD_CPU_REDUCE_UTIL_HPP + +#include "miopen/reducetensor.hpp" +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace reduce { + +template +static inline bool float_equal_one(T); + +static inline bool float_equal_one(float x) { return x == 1.0f; }; + +static inline bool float_equal_one(double x) { return x == 1.0; }; + +static inline bool float_equal_one(half_float::half x) +{ + return x == convert_type(1.0f); +}; + +template +static inline bool float_equal_zero(T x); + +static inline bool float_equal_zero(float x) { return x == 0.0f; }; + +static inline bool float_equal_zero(double x) { return x == 0.0; }; + +static inline bool float_equal_zero(half_float::half x) +{ + return x == convert_type(0.0f); +}; + +template +static inline void build_radix(const std::vector& lens, std::vector& radix) +{ + const std::size_t D = lens.size(); + radix.assign(D, 1); + for(std::size_t d = D; d-- > 1;) + radix[d - 1] = radix[d] * static_cast(lens[d]); // radix[d] = Π_{k>d} lens[k] +} + +// i -> memory offset using lens-radix + actual strides +template +static inline std::size_t linear_to_offset_by_lens_strides(std::size_t i, + const std::vector& lens, + const std::vector& radix, + const std::vector& strides) +{ + std::size_t off = 0; + for(std::size_t d = 0; d < lens.size(); ++d) + { + const std::size_t idx_d = (i / radix[d]) % static_cast(lens[d]); + off += idx_d * static_cast(strides[d]); + } + return off; +} + +template +static inline std::function PreUnaryOpFn(miopenReduceTensorOp_t op_, std::size_t) +{ + using std::abs; + + switch(op_) + { + case MIOPEN_REDUCE_TENSOR_NORM1: return ([&](compType& a_) { a_ = abs(a_); }); + case MIOPEN_REDUCE_TENSOR_NORM2: return ([&](compType& a_) { a_ = a_ * a_; }); + case MIOPEN_REDUCE_TENSOR_AMAX: return ([&](compType& a_) { a_ = abs(a_); }); + 
+ case MIOPEN_REDUCE_TENSOR_AVG: + case MIOPEN_REDUCE_TENSOR_ADD: + case MIOPEN_REDUCE_TENSOR_MUL: + case MIOPEN_REDUCE_TENSOR_MIN: + case MIOPEN_REDUCE_TENSOR_MAX: return ([&](compType&) {}); + } + + throw std::runtime_error(std::string(__FUNCTION__) + + ": using undefined Reduction operation is not permitted"); +}; + +template +static inline std::function PosUnaryOpFn(miopenReduceTensorOp_t op_, + std::size_t divider) +{ + using std::sqrt; + + switch(op_) + { + case MIOPEN_REDUCE_TENSOR_NORM2: return ([&](compType& a_) { a_ = sqrt(a_); }); + + case MIOPEN_REDUCE_TENSOR_AVG: + return ([&, divider](compType& a_) { + a_ = a_ / convert_type(static_cast(divider)); + }); + + case MIOPEN_REDUCE_TENSOR_ADD: + case MIOPEN_REDUCE_TENSOR_NORM1: + case MIOPEN_REDUCE_TENSOR_MUL: + case MIOPEN_REDUCE_TENSOR_MIN: + case MIOPEN_REDUCE_TENSOR_MAX: + case MIOPEN_REDUCE_TENSOR_AMAX: return ([&](compType&) {}); + } + + throw std::runtime_error(std::string(__FUNCTION__) + + ": using undefined Reduction operation is not permitted"); +}; + +template +static inline std::function ReduceOpFn(miopenReduceTensorOp_t op_) +{ + switch(op_) + { + case MIOPEN_REDUCE_TENSOR_ADD: + case MIOPEN_REDUCE_TENSOR_AVG: + case MIOPEN_REDUCE_TENSOR_NORM1: + case MIOPEN_REDUCE_TENSOR_NORM2: return ([&](compType& a_, compType b_) { a_ = a_ + b_; }); + + case MIOPEN_REDUCE_TENSOR_MUL: return ([&](compType& a_, compType b_) { a_ = a_ * b_; }); + + case MIOPEN_REDUCE_TENSOR_MIN: + return ([&](compType& a_, compType b_) { + if(a_ > b_) + a_ = b_; + }); + + case MIOPEN_REDUCE_TENSOR_MAX: + case MIOPEN_REDUCE_TENSOR_AMAX: + return ([&](compType& a_, compType b_) { + if(a_ < b_) + a_ = b_; + }); + } + + throw std::runtime_error(std::string(__FUNCTION__) + + ": using undefined Reduction operation is not permitted"); +}; + +template +static inline std::function +ReduceOpFn2(miopenReduceTensorOp_t op_) +{ + switch(op_) + { + case MIOPEN_REDUCE_TENSOR_MIN: + return ([&](compType& a_, compType b_, bool& changed) { + 
if(a_ > b_) + { + a_ = b_; + changed = true; + } + else + { + changed = false; + } + }); + + case MIOPEN_REDUCE_TENSOR_MAX: + case MIOPEN_REDUCE_TENSOR_AMAX: + return ([&](compType& a_, compType b_, bool& changed) { + if(a_ < b_) + { + a_ = b_; + changed = true; + } + else + { + changed = false; + } + }); + + case MIOPEN_REDUCE_TENSOR_ADD: + case MIOPEN_REDUCE_TENSOR_MUL: + case MIOPEN_REDUCE_TENSOR_AVG: + case MIOPEN_REDUCE_TENSOR_NORM1: + case MIOPEN_REDUCE_TENSOR_NORM2: return (std::function{}); + }; + + throw std::runtime_error(std::string(__FUNCTION__) + + ": using undefined Reduction operation is not permitted"); +}; + +template +static inline compType ReduceOpZeroVal(miopenReduceTensorOp_t op_) +{ + switch(op_) + { + case MIOPEN_REDUCE_TENSOR_ADD: + case MIOPEN_REDUCE_TENSOR_AVG: + case MIOPEN_REDUCE_TENSOR_NORM1: + case MIOPEN_REDUCE_TENSOR_NORM2: return (convert_type(0.0f)); + + case MIOPEN_REDUCE_TENSOR_MUL: return (convert_type(1.0f)); + + case MIOPEN_REDUCE_TENSOR_MIN: return (std::numeric_limits::max()); + + case MIOPEN_REDUCE_TENSOR_MAX: return (std::numeric_limits::lowest()); + case MIOPEN_REDUCE_TENSOR_AMAX: return (convert_type(0.0f)); + } + + throw std::runtime_error(std::string(__FUNCTION__) + + ": using undefined Reduction operation is not permitted"); +}; + +template +static inline void binop_with_nan_check(miopenNanPropagation_t nanOpt, + reduceOpT&& opReduce, + compType& accuVal, + compType currVal) +{ + using std::isnan; + + if(nanOpt == MIOPEN_NOT_PROPAGATE_NAN) + { + opReduce(accuVal, currVal); + } + else + { + if(isnan(currVal)) + accuVal = currVal; + else + opReduce(accuVal, currVal); + }; +}; + +template +static inline void binop_with_nan_check2(miopenNanPropagation_t nanOpt, + reduceOpT&& opReduce, + compType& accuVal, + compType currVal, + int& accuIndex, + int currIndex) +{ + using std::isnan; + + if(nanOpt == MIOPEN_NOT_PROPAGATE_NAN) + { + bool changed; + + opReduce(accuVal, currVal, changed); + + if(changed) + accuIndex = 
currIndex; + } + else + { + if(isnan(currVal)) + { + accuVal = currVal; + accuIndex = currIndex; + } + else + { + bool changed; + + opReduce(accuVal, currVal, changed); + + if(changed) + accuIndex = currIndex; + }; + }; +}; + +}; // end of namespace reduce + +template +std::vector> get_all_indexes(const std::vector& lens) +{ + const std::size_t D = lens.size(); + assert(D > 0); + + std::size_t N = 1; + for(const auto L : lens) + N *= static_cast(L); + + std::vector> out; + out.resize(N); + for(auto& row : out) + row.resize(D); + + std::vector stride(D, 1); + for(std::size_t d = D; d-- > 1;) + stride[d - 1] = stride[d] * static_cast(lens[d]); + + for(std::size_t r = 0; r < N; ++r) + { + for(std::size_t d = 0; d < D; ++d) + out[r][d] = static_cast((r / stride[d]) % static_cast(lens[d])); + } + + return out; +} + +template +static inline T +linear_to_offset(size_t li, const std::vector& lens, const std::vector& strides) +{ + T off = 0; + for(int d = int(lens.size()) - 1; d >= 0; --d) + { + const T idx = li % lens[d]; + li /= lens[d]; + off += idx * strides[d]; + } + return off; +} + +template +T get_offset_from_index(const std::vector& strides, const std::vector& index) +{ + T offset = 0; + + assert(strides.size() == index.size()); + + for(int i = 0; i < index.size(); i++) + offset += strides[i] * index[i]; + + return (offset); +}; + +template +T get_flatten_offset(const std::vector& lengths, const std::vector& index) +{ + T offset = 0; + + assert(lengths.size() == index.size() && !lengths.empty()); + + int len = lengths.size(); + T stride = 1; + + // for len==1, the loop is not executed + for(int i = len - 1; i > 0; i--) + { + offset += stride * index[i]; + + stride *= lengths[i]; + }; + + offset += stride * index[0]; + + return (offset); +}; + +template +struct Reducer +{ + compType acc; + bool withIdx; + int idx; // meaningful only when WithIdx==true + miopenNanPropagation_t nanOpt; + // functors for reduction + 
decltype(reduce::ReduceOpFn(MIOPEN_REDUCE_TENSOR_ADD)) opNoIdx; + decltype(reduce::ReduceOpFn2(MIOPEN_REDUCE_TENSOR_ADD)) opWithIdx; + + Reducer(miopenNanPropagation_t n, miopenReduceTensorOp_t rop, compType zero, bool useIdx) + : acc(zero), + withIdx(useIdx), + idx(0), + nanOpt(n), + opNoIdx(reduce::ReduceOpFn(rop)), + opWithIdx(reduce::ReduceOpFn2(rop)) + { + } + + inline void step(compType v, int flat_i) + { + if(withIdx) + reduce::binop_with_nan_check2(nanOpt, opWithIdx, acc, v, idx, flat_i); + else + reduce::binop_with_nan_check(nanOpt, opNoIdx, acc, v); + } + + inline void combine(const Reducer& other) + { + if(withIdx) + reduce::binop_with_nan_check2(nanOpt, opWithIdx, acc, other.acc, idx, other.idx); + else + reduce::binop_with_nan_check(nanOpt, opNoIdx, acc, other.acc); + } +}; + +template +std::tuple, tensor> reduce_cpu_common(const miopenReduceTensorOp_t& reduceOp, + const miopenNanPropagation_t& nanOpt, + const std::vector& inLengths, + const std::vector& outLengths, + const std::vector& input, + const std::vector& inStrides, + const std::vector& output, + const std::vector& outStrides, + float alpha, + float beta, + bool parallel, + bool withIdx) +{ + using reduce::convert_type; + using reduce::ReduceOpZeroVal; + + // Partition dims + std::vector invariantDims, toReduceDims; + std::vector invLens, redLens, invStrides_v, redStrides_v; + + for(int i = 0; i < static_cast(inLengths.size()); ++i) + { + if(inLengths[i] == outLengths[i]) + { + invariantDims.push_back(i); + invLens.push_back(inLengths[i]); + invStrides_v.push_back(inStrides[i]); + } + else + { + toReduceDims.push_back(i); + redLens.push_back(inLengths[i]); + redStrides_v.push_back(inStrides[i]); + } + } + + const bool reduceAllDims = invariantDims.empty(); + + // unary ops & zero vals + const compType zeroV = ReduceOpZeroVal(reduceOp); + + // divider = Π reduced dims (or N if reduce-all) + std::size_t divider = 1; + if(reduceAllDims) + divider = std::accumulate( + inLengths.begin(), 
inLengths.end(), std::size_t{1}, std::multiplies<>()); + else + divider = + std::accumulate(redLens.begin(), redLens.end(), std::size_t{1}, std::multiplies<>()); + + auto PreUnaryOp = reduce::PreUnaryOpFn(reduceOp, divider); + auto PosUnaryOp = reduce::PosUnaryOpFn(reduceOp, divider); + + // outputs + auto res = tensor{outLengths}; + res.data = output; + auto res_indices = tensor{outLengths}; + if(withIdx) + std::fill(res_indices.begin(), res_indices.end(), 0); + + if(reduceAllDims) + { + // Flatten whole tensor + const std::size_t N = divider; // product of all dims + std::vector lens_radix; + reduce::build_radix(inLengths, lens_radix); + + // parallel chunking + std::size_t hw = + std::max(std::size_t{1}, static_cast(std::thread::hardware_concurrency())); + const std::size_t P = std::min(N, hw * 4ul); + const std::size_t chunk = (N + P - 1) / P; + + std::vector> partial; + partial.reserve(P); + for(std::size_t p = 0; p < P; ++p) + partial.emplace_back(nanOpt, reduceOp, zeroV, withIdx); + + auto worker = [&](int p) { + const std::size_t begin = std::size_t(p) * chunk; + const std::size_t end = std::min(begin + chunk, N); + + auto& r = partial[p]; + for(std::size_t i = begin; i < end; ++i) + { + const auto off = + reduce::linear_to_offset_by_lens_strides(i, inLengths, lens_radix, inStrides); + auto v = convert_type(input[off]); + PreUnaryOp(v); + r.step(v, static_cast(i)); // flat index across whole tensor + } + }; + + if(parallel) + { + miopen::par_for(static_cast(P), worker); + } + else + { + for(int p = 0; p < P; ++p) + { + worker(p); + } + } + + // combine + Reducer R(nanOpt, reduceOp, zeroV, withIdx); + for(std::size_t p = 0; p < P; ++p) + R.combine(partial[p]); + + // post + PosUnaryOp(R.acc); + if(alpha != 1.0f) + R.acc *= convert_type(alpha); + if(beta != 0.0f) + R.acc += convert_type(output[0]) * convert_type(beta); + + res.data[0] = convert_type(R.acc); + if(withIdx) + res_indices.data[0] = R.idx; + } + else + { + // Build radices for invariant and 
reduced subspaces + std::vector invRad, redRad; + reduce::build_radix(invLens, invRad); + reduce::build_radix(redLens, redRad); + + const std::size_t INV = + std::accumulate(invLens.begin(), invLens.end(), std::size_t{1}, std::multiplies<>()); + const std::size_t TR = divider; + + std::size_t hw = + std::max(std::size_t{1}, static_cast(std::thread::hardware_concurrency())); + const std::size_t Te = std::min(hw * 4ul, std::max(1, INV)); + const std::size_t chunk = (INV + Te - 1) / Te; + + auto worker = [&](int t) { + const std::size_t row0 = std::size_t(t) * chunk; + const std::size_t row1 = std::min(row0 + chunk, INV); + + for(std::size_t r = row0; r < row1; ++r) + { + // decode invariant multi-index; compute base offsets + std::size_t tmp = r; + std::size_t base_in_off = 0; + std::size_t base_out_off = 0; + for(std::size_t k = 0; k < invLens.size(); ++k) + { + const std::size_t idx = (tmp / invRad[k]) % invLens[k]; + base_in_off += idx * invStrides_v[k]; + base_out_off += idx * outStrides[invariantDims[k]]; + } + + Reducer R(nanOpt, reduceOp, zeroV, withIdx); + + // iterate reduced subspace + for(std::size_t i = 0; i < TR; ++i) + { + std::size_t tmp2 = i; + std::size_t red_off = 0; + for(std::size_t k = 0; k < redLens.size(); ++k) + { + const std::size_t idx = (tmp2 / redRad[k]) % redLens[k]; + red_off += idx * redStrides_v[k]; + } + + auto v = convert_type(input[base_in_off + red_off]); + PreUnaryOp(v); + R.step(v, static_cast(i)); // flat index inside reduced subspace + } + + PosUnaryOp(R.acc); + if(alpha != 1.0f) + R.acc *= convert_type(alpha); + if(beta != 0.0f) + R.acc += + convert_type(output[base_out_off]) * convert_type(beta); + + res.data[base_out_off] = convert_type(R.acc); + if(withIdx) + res_indices.data[base_out_off] = R.idx; + } + }; + + if(parallel) + { + miopen::par_for(static_cast(Te), worker); + } + else + { + for(int te = 0; te < Te; ++te) + { + worker(te); + } + } + } + + return {res, res_indices}; +} + +template +std::tuple, tensor> 
+reduce_cpu_common(const miopen::ReduceTensorDescriptor& reduceDesc, + const tensor& input, + const tensor& output, + float alpha, + float beta, + bool parallel, + bool withIdx) +{ + auto inLengths = input.desc.GetLengths(); + auto outLengths = output.desc.GetLengths(); + auto inStrides = input.desc.GetStrides(); + auto outStrides = output.desc.GetStrides(); + + const auto reduceOp = reduceDesc.reduceTensorOp_; + const auto nanOpt = reduceDesc.reduceTensorNanOpt_; + + return reduce_cpu_common(reduceOp, + nanOpt, + inLengths, + outLengths, + input.data, + inStrides, + output.data, + outStrides, + alpha, + beta, + parallel, + withIdx); +} + +#endif diff --git a/projects/miopen/miopen_utils/include/miopen_utils/fusionHost.hpp b/projects/miopen/miopen_utils/include/miopen_utils/fusionHost.hpp new file mode 100644 index 000000000000..2d1d33cc898a --- /dev/null +++ b/projects/miopen/miopen_utils/include/miopen_utils/fusionHost.hpp @@ -0,0 +1,993 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2018 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + *******************************************************************************/ +#pragma once +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +template +void convHostForward(const tensor& input, + tensor& output, + const tensor& weights, + const int bias_mode, + const tensor& bias, + const miopenConvolutionDescriptor_t convDesc) +{ + + int in_n, in_c, in_h, in_w; + int in_nstride, in_cstride, in_hstride, in_wstride; + std::tie(in_n, in_c, in_h, in_w) = miopen::tien<4>(input.desc.GetLengths()); + std::tie(in_nstride, in_cstride, in_hstride, in_wstride) = + miopen::tien<4>(input.desc.GetStrides()); + + int wei_n, wei_c, wei_h, wei_w; + int wei_nstride, wei_cstride, wei_hstride, wei_wstride; + std::tie(wei_n, wei_c, wei_h, wei_w) = miopen::tien<4>(weights.desc.GetLengths()); + std::tie(wei_nstride, wei_cstride, wei_hstride, wei_wstride) = + miopen::tien<4>(weights.desc.GetStrides()); + + int out_n, out_c, out_h, out_w; + int out_nstride, out_cstride, out_hstride, out_wstride; + std::tie(out_n, out_c, out_h, out_w) = miopen::tien<4>(output.desc.GetLengths()); + std::tie(out_nstride, out_cstride, out_hstride, out_wstride) = + miopen::tien<4>(output.desc.GetStrides()); + + int stride_h, stride_w, pad_h, pad_w, dilation_h, dilation_w; + miopenConvolutionMode_t mode; + miopenPaddingMode_t pmode = miopen::deref(convDesc).paddingMode; + miopenGetConvolutionDescriptor( + convDesc, &mode, &pad_h, &pad_w, &stride_h, &stride_w, &dilation_h, &dilation_w); + + if(pmode == miopenPaddingSame) + { + pad_h = (in_h % stride_h == 0) ? 
(std::max((wei_h - stride_h), 0)) + : (std::max((wei_h - (in_h % stride_h)), 0)); + pad_w = (in_w % stride_w == 0) ? (std::max((wei_w - stride_w), 0)) + : (std::max((wei_w - (in_w % stride_w)), 0)); + pad_h /= 2; + pad_w /= 2; + } + else if(pmode == miopenPaddingValid) + { + pad_h = 0; + pad_w = 0; + } + + if(out_h <= 0 || out_w <= 0) + MIOPEN_THROW("Invalid Test Case: Check Output Dimension."); + + for(int o = 0; o < out_n; o++) + { // mini-batch size + for(int w = 0; w < out_c; w++) + { // out_channels (num filters) + for(int i = 0; i < out_h; i++) + { // output_height (from getforwardoutputdim()) + int in_off_h = i * stride_h; + for(int j = 0; j < out_w; j++) + { // output_width (from getforwardoutputdim()) + /*auto acc = static_cast(0.);*/ + auto acc = static_cast(0.); + int in_off_w = j * stride_w; + for(int k = 0; k < in_c; k++) + { // in_channels (RGB) + for(int x = 0; x < wei_h; x++) + { + int in_x = in_off_h - pad_h + x * dilation_h; + if(in_x >= 0 && in_x < in_h) + { + for(int y = 0; y < wei_w; y++) + { + int in_y = in_off_w - pad_w + y * dilation_w; + if(in_y >= 0 && in_y < in_w) + { + acc += double( + static_cast(input[o * in_nstride + k * in_cstride + + in_x * in_w + in_y]) * + static_cast(weights(w, k, x, y))); + } + } + } + } + } + acc = bias_mode != 0 ? 
acc + static_cast(bias[w]) : acc; + output[o * out_nstride + w * out_cstride + i * out_hstride + j] = + static_cast(acc); + } + } + } + } +} + +template +void batchNormSpatialHostInference(const tensor& input, + tensor& output, + const tensor& scale, + const tensor& bias, + double epsilon, + const tensor& estimatedMean, + const tensor& estimatedVariance, + bool useInverseVariance = false) +{ + + int n_batches, channels, height, width; + std::tie(n_batches, channels, height, width) = miopen::tien<4>(input.desc.GetLengths()); + miopen::par_for(channels, 1, [&](int cidx) { // via channel + V mean = estimatedMean(0, cidx, 0, 0); + V variance = estimatedVariance(0, cidx, 0, 0); + double invertVar = + useInverseVariance ? static_cast(variance) : 1.0 / sqrt(variance + epsilon); + // process the batch per channel + for(int row = 0; row < height; row++) + { // via rows + for(int column = 0; column < width; column++) + { // via columns + for(int bidx = 0; bidx < n_batches; bidx++) + { // via mini_batch + double elemStd = static_cast(input(bidx, cidx, row, column)) - mean; + double inhat = elemStd * invertVar; + output(bidx, cidx, row, column) = + static_cast(scale(0, cidx, 0, 0) * inhat + bias(0, cidx, 0, 0)); + // printf("output: %f\n",scale(0, cidx, 0, 0) * inhat + bias(0, cidx, 0, 0)); + } + } + } + }); +} + +template +void batchNormPerActivHostInference(const tensor& input, + tensor& output, + const tensor& scale, + const tensor& bias, + double epsilon, + const tensor& estimatedMean, + const tensor& estimatedVariance, + bool useInverseVariance = false) +{ + int n_batches, channels, height, width; + std::tie(n_batches, channels, height, width) = miopen::tien<4>(input.desc.GetLengths()); + miopen::par_for(channels, 1, [&](int cidx) { // via channel + for(int row = 0; row < height; row++) + { // via rows + for(int column = 0; column < width; column++) + { // via columns + // apply down the n_batch dimension + double mean = estimatedMean(0, cidx, row, column); + double 
variance = estimatedVariance(0, cidx, row, column); + double elemInvVar = useInverseVariance ? variance : 1.0 / sqrt(variance + epsilon); + for(int bidx = 0; bidx < n_batches; bidx++) + { // via mini_batch + // per (x-dims) channel load a block of data into LDS + double elemStd = input(bidx, cidx, row, column) - mean; + double inhat = elemStd * elemInvVar; + output(bidx, cidx, row, column) = + scale(0, cidx, row, column) * inhat + bias(0, cidx, row, column); + // printf("output: %f\n",output(bidx, cidx, row, column)); + } + } + } + }); +} + +template +void batchNormSpatialHostFwdTrain(const tensor& input, + tensor& out, + const tensor& scale, + const tensor& bias, + double epsilon, + double expAvgFactor, + tensor& saveMean, + tensor& saveInvVar, + tensor& runMean, + tensor& runVar) +{ + + int height, width, n_batch, channels; + std::tie(n_batch, channels, height, width) = miopen::tien<4>(input.desc.GetLengths()); + const auto nhw = double(height * width * n_batch); + + miopen::par_for(channels, 1, [&](int cidx) { + double elemStd = 0.; + double variance_accum = 0.; + double mean_accum = 0.; + double invVar = 0.; + double newRunMean = 0.; + double adjust = 0.; + + // process the batch per channel + for(int bidx = 0; bidx < n_batch; bidx++) + { // via mini_batch + for(int row = 0; row < height; row++) + { // via rows + for(int column = 0; column < width; column++) + { // via columns + // #1 calculate the mean + // iterating through the stack of images in the mini_batch + auto inval = static_cast(input(bidx, cidx, row, column)); + mean_accum += inval; + variance_accum += inval * inval; + } // end for (column) + } // end for (row) + } // end for (n) + + mean_accum /= nhw; + variance_accum /= nhw; + variance_accum += (-mean_accum * mean_accum); + invVar = 1.0 / sqrt(variance_accum + epsilon); + + // #4 apply the normalization + // x_hat = (x_i - mean) / sqrt(variance_accum + epsilon) + for(int bidx = 0; bidx < n_batch; bidx++) + { // via mini_batch + for(int row = 0; 
row < height; row++) + { // via rows + for(int column = 0; column < width; column++) + { // via columns + // #5 Gamma and Beta adjust + // y_i = gamma*x_hat + beta + elemStd = (static_cast(input(bidx, cidx, row, column)) - + mean_accum); // (x_i - mean) + out(bidx, cidx, row, column) = static_cast( + scale(0, cidx, 0, 0) * (invVar * elemStd) + bias(0, cidx, 0, 0)); + } // for (column) + } // for (row) + } // end for(n_batchs) + if(!saveMean.data.empty()) + { + saveMean(0, cidx, 0, 0) = mean_accum; + saveInvVar(0, cidx, 0, 0) = invVar; + } + if(!runMean.data.empty()) + { + newRunMean = runMean(0, cidx, 0, 0) * (1 - expAvgFactor); + runMean(0, cidx, 0, 0) = mean_accum * expAvgFactor + newRunMean; // newMean*factor + tmp + // var(n+1) = p * var(n-1) + (1 - p)*(b/b-1)*var(n) + adjust = (n_batch * height * width == 1) ? variance_accum + : (nhw / (nhw - 1)) * variance_accum; + runVar(0, cidx, 0, 0) = + (1 - expAvgFactor) * runVar(0, cidx, 0, 0) + expAvgFactor * adjust; + } + }); +} + +template +void batchNormSpatialHostBwdTrain(const tensor& x_input, + tensor& dy_input, + tensor& dx_out, + const tensor& bnScale, + const tensor& bnBias, + tensor& dscale, + tensor& dbias, + const tensor& savedMean, + const tensor& savedInvVar, + miopenActivationMode_t activ_mode, + double activ_beta, + double activ_alpha) +{ + double activ_gamma = 0.; + int height, width, n_batch, channels; + std::tie(n_batch, channels, height, width) = miopen::tien<4>(x_input.desc.GetLengths()); + auto nhw = double(height * width * n_batch); + int in_cstride = height * width; + + if(activ_mode > 0) + { + tensor input_norm = + tensor{x_input.desc.GetLayout_t(), x_input.desc.GetLengths()}; + miopen::par_for(channels, 1, [&](int cidx) { + double mean = 0.0; + double invVar = 0.0; + double elemStd = 0.; + double mean_accum = 0.0; + double variance_accum = 0.0; + if(!savedMean.data.empty()) + { + mean = static_cast(savedMean(0, cidx, 0, 0)); // HxW elements + invVar = static_cast(savedInvVar(0, cidx, 0, 0)); 
// HxW elements + } + else + { + for(int row = 0; row < height; row++) + { // via rows + for(int column = 0; column < width; column++) + { // via columns + for(int bidx = 0; bidx < n_batch; bidx++) + { // via mini_batch + auto inval = static_cast(x_input(bidx, cidx, row, column)); + mean_accum += inval; + variance_accum += inval * inval; + } + } + } + mean_accum /= nhw; + variance_accum /= nhw; + variance_accum += (-mean_accum * mean_accum); + mean = mean_accum; + invVar = 1.0 / sqrt(variance_accum); + } + for(int row = 0; row < height; row++) + { // via rows + for(int column = 0; column < width; column++) + { // via columns + for(int bidx = 0; bidx < n_batch; bidx++) + { // via mini_batch + elemStd = static_cast(x_input(bidx, cidx, row, column)) - + mean; // (x_i - mean) + input_norm(bidx, cidx, row, column) = static_cast( + bnScale(0, cidx, 0, 0) * (elemStd * invVar) + bnBias(0, cidx, 0, 0)); + } + } + } + }); + + activationHostBnormBwd(activ_mode, + activ_gamma, + activ_beta, + activ_alpha, + dy_input.data, + input_norm.data, + dy_input.data); + } + + miopen::par_for(channels, 1, [&](int cidx) { + double elemStd = 0.; + unsigned int xhat_index; + double mean = 0.0; + double invVar = 0.0; + double dyelem = 0.; + std::vector xhat(static_cast(n_batch) * in_cstride, 0.0); + // process the batch per channel + dscale(0, cidx, 0, 0) = 0.; + dbias(0, cidx, 0, 0) = 0.; + + if(!savedMean.data.empty()) + { + + mean = savedMean(0, cidx, 0, 0); // HxW elements + invVar = savedInvVar(0, cidx, 0, 0); // HxW elements + } + else + { + double variance_accum = 0.; + double mean_accum = 0.; + double inv_Var = 0.; + + // process the batch per channel + for(int bidx = 0; bidx < n_batch; bidx++) + { // via mini_batch + for(int row = 0; row < height; row++) + { // via rows + for(int column = 0; column < width; column++) + { // via columns + // #1 calculate the mean + // iterating through the stack of images in the mini_batch + auto inval = static_cast(x_input(bidx, cidx, row, column)); 
+ mean_accum += inval; + variance_accum += inval * inval; + } // end for (column) + } // end for (row) + } // end for (n) + + mean_accum /= nhw; + variance_accum /= nhw; + variance_accum += (-mean_accum * mean_accum); + inv_Var = 1.0 / sqrt(variance_accum); + + mean = mean_accum; + invVar = inv_Var; + } + for(int row = 0; row < height; row++) + { // via rows + for(int column = 0; column < width; column++) + { // via columns + for(int bidx = 0; bidx < n_batch; bidx++) + { // via mini_batch + xhat_index = in_cstride * bidx + (width * row + column); + // per (x-dims) channel load a block of data into LDS + elemStd = static_cast(x_input(bidx, cidx, row, column)) - + mean; // (x_i - mean) + xhat[xhat_index] = elemStd * invVar; + dyelem = static_cast(dy_input(bidx, cidx, row, column)); + dbias(0, cidx, 0, 0) += dyelem; + dscale(0, cidx, 0, 0) += xhat[xhat_index] * dyelem; + } // end for(n_batch) + } // for (column) + } // for (row) + + for(int row = 0; row < height; row++) + { // via rows + for(int column = 0; column < width; column++) + { // via columns + for(int bidx = 0; bidx < n_batch; bidx++) + { // via mini_batch + xhat_index = in_cstride * bidx + (width * row + column); + + double tmp1 = nhw * dy_input(bidx, cidx, row, column) - dbias(0, cidx, 0, 0); + double tmp2 = -xhat[xhat_index] * dscale(0, cidx, 0, 0); + double tmp3 = (bnScale(0, cidx, 0, 0) * invVar) / nhw; + dx_out(bidx, cidx, row, column) = + static_cast(tmp3 * (tmp2 + tmp1)); + } // end for(n_batchs) + } // for (column) + } // for (row) + }); // for (channel) +} + +template +void batchNormActivSpatialHostBwdTrain(miopenActivationMode_t activMode, + double gamma, + double beta, + double alpha, + const tensor& x_input, + const tensor& dy_input, + const tensor& y_input, + tensor& dx_out, + const tensor& bnScale, + const tensor& bias, + tensor& dscale, + tensor& dbias, + const tensor& savedMean, + const tensor& savedInvVar) +{ + + int height, width, n_batch, channels; + std::tie(n_batch, channels, height, 
width) = miopen::tien<4>(x_input.desc.GetLengths()); + auto nhw = double(height * width * n_batch); + int in_cstride = height * width; + + miopen::par_for(channels, 1, [&](int cidx) { + double elemStd = 0.; + unsigned int xhat_index; + double mean = static_cast(savedMean(0, cidx, 0, 0)); // HxW elements + double invVar = static_cast(savedInvVar(0, cidx, 0, 0)); // HxW elements + double dyelem = 0.; + std::vector xhat(static_cast(n_batch) * in_cstride, 0.0); + // process the batch per channel + dscale(0, cidx, 0, 0) = 0.; + dbias(0, cidx, 0, 0) = 0.; + + for(int row = 0; row < height; row++) + { // via rows + for(int column = 0; column < width; column++) + { // via columns + for(int bidx = 0; bidx < n_batch; bidx++) + { // via mini_batch + + // recompute forward batch norm + xhat_index = in_cstride * bidx + (width * row + column); + // per (x-dims) channel load a block of data into LDS + elemStd = static_cast(x_input(bidx, cidx, row, column)) - + mean; // (x_i - mean) + xhat[xhat_index] = elemStd * invVar; + double bnrefowd = + bnScale(0, cidx, 0, 0) * xhat[xhat_index] + bias(0, cidx, 0, 0); + activationHostBwdElement(activMode, + gamma, + beta, + alpha, + dy_input(bidx, cidx, row, column), + bnrefowd, + y_input(bidx, cidx, row, column), + dyelem); + dbias(0, cidx, 0, 0) += dyelem; + dscale(0, cidx, 0, 0) += xhat[xhat_index] * dyelem; + } // end for(n_batch) + } // for (column) + } // for (row) + + for(int row = 0; row < height; row++) + { // via rows + for(int column = 0; column < width; column++) + { // via columns + for(int bidx = 0; bidx < n_batch; bidx++) + { // via mini_batch + xhat_index = in_cstride * bidx + (width * row + column); + double bnrefowd = + bnScale(0, cidx, 0, 0) * xhat[xhat_index] + bias(0, cidx, 0, 0); + activationHostBwdElement(activMode, + gamma, + beta, + alpha, + dy_input(bidx, cidx, row, column), + bnrefowd, + y_input(bidx, cidx, row, column), + dyelem); + // double tmp1 = nhw * dy_input(bidx, cidx, row, column) - dbias(0, cidx, 0, 0); + 
double tmp1 = nhw * dyelem - dbias(0, cidx, 0, 0); + double tmp2 = -xhat[xhat_index] * dscale(0, cidx, 0, 0); + double tmp3 = (bnScale(0, cidx, 0, 0) * invVar) / nhw; + dx_out(bidx, cidx, row, column) = static_cast(tmp3 * (tmp2 + tmp1)); + } // end for(n_batchs) + } // for (column) + } // for (row) + }); // for (channel) +} + +template +void batchNormPerActHostFwdTrain(const tensor& input, + tensor& out, + const tensor& scale, + const tensor& bias, + double epsilon, + double expAvgFactor, + tensor& saveMean, + tensor& saveInvVar, + tensor& runMean, + tensor& runVar) +{ + + int height, width, n_batch, channels; + std::tie(n_batch, channels, height, width) = miopen::tien<4>(input.desc.GetLengths()); + const auto n = double(n_batch); + + miopen::par_for(channels, 1, [&](int cidx) { + double mean_accum = 0.; + double variance_accum = 0.; + double elemStd = 0.; + double elemInvVar = 0.; + double inhat = 0.; + double newRunMean = 0.; + double adjust = 0.; + + // process the batch per channel + for(int row = 0; row < height; row++) + { // via rows + for(int column = 0; column < width; column++) + { // via columns + + mean_accum = 0.; + variance_accum = 0.; + for(int bidx = 0; bidx < n_batch; bidx++) + { // via mini_batch + // #1 calculate the mean :: iterating through the stack of images in the + // mini_batch + auto intval = static_cast(input(bidx, cidx, row, column)); + mean_accum += intval; + variance_accum += intval * intval; + } + mean_accum /= n; + variance_accum /= n; + variance_accum = variance_accum - (mean_accum * mean_accum); + elemInvVar = 1.0 / double(sqrt(variance_accum + epsilon)); + + // #4 apply the normalization :: x_hat = (x_i - mean) / sqrt(variance_accum - + // epsilon) + for(int bidx = 0; bidx < n_batch; bidx++) + { // via mini_batch + elemStd = (input(bidx, cidx, row, column) - mean_accum); // (x_i - mean) + inhat = elemStd * elemInvVar; + // #5 Gamma and Beta adjust :: y_i = gamma*x_hat + beta + out(bidx, cidx, row, column) = static_cast( + 
scale(0, cidx, row, column) * inhat + bias(0, cidx, row, column)); + } // end for(n_batch) + + if(!runMean.data.empty()) + { + newRunMean = runMean(0, cidx, row, column) * (1.0 - expAvgFactor); + runMean(0, cidx, row, column) = + mean_accum * expAvgFactor + newRunMean; // newMean*factor + tmp + } + // var(n+1) = p * var(n-1) + (1 - p)*(b/b-1)*var(n) + if(!runVar.data.empty()) + { + adjust = (n_batch == 1) ? variance_accum : (n / (n - 1.0)) * variance_accum; + runVar(0, cidx, row, column) = + (1 - expAvgFactor) * runVar(0, cidx, row, column) + expAvgFactor * adjust; + } + if(!saveMean.data.empty() || !saveInvVar.data.empty()) + { + saveMean(0, cidx, row, column) = static_cast(mean_accum); + saveInvVar(0, cidx, row, column) = static_cast(elemInvVar); + } + + } // for (column) + } // for (row) + }); +} + +template +void batchNormPerActHostBwdTrain(const tensor& x_input, + const tensor& dy_input, + tensor& dx_out, + const tensor& scale, + tensor& dscale, + tensor& dbias, + const tensor& savedMean, + const tensor& savedInvVar) +{ + + int height, width, n_batch, channels; + std::tie(n_batch, channels, height, width) = miopen::tien<4>(x_input.desc.GetLengths()); + int in_cstride = height * width; + auto n = double(n_batch); + + miopen::par_for(channels, 1, [&](int cidx) { + double elemStd = 0.; + unsigned int xhat_index; + double mean = 0.; + double elemInvVar = 0.; + double dyelem = 0.; + double dxhat = 0.; + double dxhathat = 0.; + double tmp1 = 0.; + std::vector xhat(static_cast(n_batch) * in_cstride); + + // process the batch per channel + for(int row = 0; row < height; row++) + { // via rows + for(int column = 0; column < width; column++) + { // via columns + dxhat = 0.; + dxhathat = 0.; + + if(!savedMean.data.empty()) + { + mean = savedMean(0, cidx, row, column); // HxW elements + elemInvVar = savedInvVar(0, cidx, row, column); // HxW elements + } + else + { + double variance_accum = 0.; + double mean_accum = 0.; + + // process the batch per channel + for(int bidx = 
0; bidx < n_batch; bidx++) + { // via mini_batch + auto inval = static_cast(x_input(bidx, cidx, row, column)); + mean_accum += inval; + variance_accum += inval * inval; + } // end for (n) + + mean_accum /= n; + variance_accum /= n; + variance_accum += (-mean_accum * mean_accum); + + mean = mean_accum; + elemInvVar = 1.0 / sqrt(variance_accum); + } + + for(int bidx = 0; bidx < n_batch; bidx++) + { // via mini_batch + xhat_index = in_cstride * bidx + (width * row + column); + // per (x-dims) channel load a block of data into LDS + elemStd = static_cast(x_input(bidx, cidx, row, column)) - + mean; // (x_i - mean) + xhat[xhat_index] = elemStd * elemInvVar; + dyelem = static_cast(dy_input(bidx, cidx, row, column)); + dbias(0, cidx, row, column) += dyelem; + dscale(0, cidx, row, column) += xhat[xhat_index] * dyelem; + tmp1 = scale(0, cidx, row, column) * dyelem; + dxhat += tmp1; + dxhathat += tmp1 * xhat[xhat_index]; + + } // end for(n_batchs) + + for(int bidx = 0; bidx < n_batch; bidx++) + { // via mini_batch + xhat_index = in_cstride * bidx + (width * row + column); + tmp1 = xhat[xhat_index] * dxhathat + dxhat; + double tmp2 = + n_batch * scale(0, cidx, row, column) * dy_input(bidx, cidx, row, column) - + tmp1; + double tmp3 = elemInvVar / (double(n)); + dx_out(bidx, cidx, row, column) = static_cast(tmp3 * tmp2); + } // end for(n_batchs) + } // for (column) + } // for (row) + }); +} + +template +void batchNormActivPerActHostBwdTrain(miopenActivationMode_t activMode, + double gamma, + double beta, + double alpha, + const tensor& x_input, + const tensor& dy_input, + const tensor& y_input, + tensor& dx_out, + const tensor& scale, + const tensor& bias, + tensor& dscale, + tensor& dbias, + const tensor& savedMean, + const tensor& savedInvVar) +{ + + int height, width, n_batch, channels; + std::tie(n_batch, channels, height, width) = miopen::tien<4>(x_input.desc.GetLengths()); + int in_cstride = height * width; + auto n = double(n_batch); + + miopen::par_for(channels, 1, 
[&](int cidx) { + double elemStd = 0.; + unsigned int xhat_index; + double mean = 0.; + double elemInvVar = 0.; + double dyelem = 0.; + double dxhat = 0.; + double dxhathat = 0.; + double tmp1 = 0.; + std::vector xhat(static_cast(n_batch) * in_cstride); + + // process the batch per channel + for(int row = 0; row < height; row++) + { // via rows + for(int column = 0; column < width; column++) + { // via columns + dxhat = 0.; + dxhathat = 0.; + + mean = savedMean(0, cidx, row, column); // HxW elements + elemInvVar = savedInvVar(0, cidx, row, column); // HxW elements + + for(int bidx = 0; bidx < n_batch; bidx++) + { // via mini_batch + xhat_index = in_cstride * bidx + (width * row + column); + // per (x-dims) channel load a block of data into LDS + elemStd = static_cast(x_input(bidx, cidx, row, column)) - + mean; // (x_i - mean) + xhat[xhat_index] = elemStd * elemInvVar; + double bnrefowd = + scale(0, cidx, row, column) * xhat[xhat_index] + bias(0, cidx, row, column); + activationHostBwdElement(activMode, + gamma, + beta, + alpha, + dy_input(bidx, cidx, row, column), + bnrefowd, + y_input(bidx, cidx, row, column), + dyelem); + /*dyelem = static_cast(dy_input(bidx, cidx, row, column));*/ + dbias(0, cidx, row, column) += dyelem; + dscale(0, cidx, row, column) += xhat[xhat_index] * dyelem; + tmp1 = scale(0, cidx, row, column) * dyelem; + dxhat += tmp1; + dxhathat += tmp1 * xhat[xhat_index]; + + } // end for(n_batchs) + + for(int bidx = 0; bidx < n_batch; bidx++) + { // via mini_batch + xhat_index = in_cstride * bidx + (width * row + column); + tmp1 = xhat[xhat_index] * dxhathat + dxhat; + double bnrefowd = + scale(0, cidx, row, column) * xhat[xhat_index] + bias(0, cidx, row, column); + activationHostBwdElement(activMode, + gamma, + beta, + alpha, + dy_input(bidx, cidx, row, column), + bnrefowd, + y_input(bidx, cidx, row, column), + dyelem); + double tmp2 = (n_batch * scale(0, cidx, row, column) * dyelem) - tmp1; + double tmp3 = elemInvVar / (double(n)); + dx_out(bidx, 
cidx, row, column) = static_cast(tmp3 * tmp2); + } // end for(n_batchs) + } // for (column) + } // for (row) + }); +} + +template +void visitActivationHostInfer( + miopenActivationMode_t activMode, double gamma, double beta, double alpha, F f) +{ + switch(activMode) + { + case miopenActivationPASTHRU: // x + f([=](double x) { return x; }); + break; + case miopenActivationLOGISTIC: // 1 / (1 + e^-x) //Sigmoid + f([=](double x) { return (1. / (1. + std::exp(-x))); }); + break; + case miopenActivationTANH: // beta * tanh(alpha * x) + f([=](double x) { return (beta * std::tanh(alpha * x)); }); + break; + case miopenActivationRELU: // max(0, x) + f([=](double x) { return ((x > 0.) ? x : 0.); }); + break; + case miopenActivationSOFTRELU: // log(1 + e^x) // bonomial normal log likelihood + f([=](double x) { + return (x > 0.) ? (x + std::log1p(std::exp(-x))) : (std::log1p(std::exp(x))); + }); + break; + case miopenActivationABS: // abs(x) + f([=](double x) { return (std::fabs(x)); }); + break; + case miopenActivationPOWER: // (alpha + beta * x) ^ gamma + f([=](double x) { + auto v = (alpha + beta * x); + return (v <= std::numeric_limits::epsilon()) ? 0. : pow(v, gamma); + }); + break; + case miopenActivationCLIPPEDRELU: // min(alpha, max(0, x)) + f([=](double x) { return (std::min(alpha, std::max(double(0.), x))); }); + break; + case miopenActivationLEAKYRELU: // alpha * x | x<=0; x | x>0 + f([=](double x) { return ((x > 0.) ? x : x * alpha); }); + break; + case miopenActivationELU: // alpha * (exp(x)-1) | x<=0; x | x>0 + f([=](double x) { return ((x > 0.) ? 
x : alpha * std::expm1(x)); }); + break; + case miopenActivationCLAMP: // max(alpha, min(beta, x)) + f([=](double x) { return (std::max(alpha, std::min(beta, x))); }); + break; + // default: printf("ERROR: unknown neuron type: %d\n", activMode); break; + } +} + +template +inline void activationHostInfer(miopenActivationMode_t activMode, + double gamma, + double beta, + double alpha, + const std::vector input, + std::vector& output) +{ + visitActivationHostInfer(activMode, gamma, beta, alpha, [&](auto f) { + miopen::par_for(input.size(), 1, [&](int index) { + output[index] = static_cast(f(static_cast(input[index]))); + }); + }); +} + +template +void visitActivationHostBwd( + miopenActivationMode_t activMode, double gamma, double beta, double alpha, F f) +{ + switch(activMode) + { + case miopenActivationPASTHRU: // x + f([=](double dy, double, double) { return dy; }); + break; + case miopenActivationLOGISTIC: // 1 / (1 + e^-x) //Sigmoid + f([=](double dy, double, double y) { return dy * y * (1 - y); }); + break; + case miopenActivationTANH: // beta * tanh(alpha * x) + f([=](double dy, double, double y) { return dy * alpha * (beta - y * y / beta); }); + break; + case miopenActivationRELU: // max(0, x) + f([=](double dy, double x, double) { return (x > 0) ? dy : 0; }); + break; + case miopenActivationSOFTRELU: // log(1 + e^x) // bonomial normal log likelihood + f([=](double dy, double x, double) { + static const double threshold = 50.; + double expval = std::exp(std::min(x, threshold)); + return dy * expval / (expval + 1.0); + }); + break; + case miopenActivationABS: // abs(x) + f([=](double dy, double x, double) { return dy * ((x > 0) ? 1 : -1); }); + break; + case miopenActivationPOWER: // (alpha + beta * x) ^ gamma + f([=](double, double x, double y) { + auto v = alpha + beta * x; + return v <= std::numeric_limits::epsilon() ? 
0 : gamma * beta * y / v; + }); + break; + case miopenActivationCLIPPEDRELU: // min(alpha, max(0, x)) + f([=](double dy, double x, double) { return (x > 0 && x <= alpha) ? dy : 0; }); + break; + case miopenActivationLEAKYRELU: // alpha * x | x<=0; x | x>0 + f([=](double dy, double x, double) { return dy * ((x > 0) ? 1 : alpha); }); + break; + case miopenActivationELU: // alpah * (exp(x)-1) | x<=0; x | x>0 + f([=](double dy, double x, double y) { return dy * ((x > 0) ? 1 : y + alpha); }); + break; + case miopenActivationCLAMP: // max(alpha, min(beta, x)) + f([=](double dy, double x, double) { return (x > alpha && x <= beta) ? dy : 0; }); + break; + // default: printf("ERROR: unknown neuron type: %d\n", activMode); break; + } +} + +template +inline void activationHostBnormBwd(miopenActivationMode_t activMode, + double gamma, + double beta, + double alpha, + const std::vector dyinput, + const std::vector xinput, + std::vector& output) +{ + double dummy; + visitActivationHostBwd(activMode, gamma, beta, alpha, [&](auto f) { + miopen::par_for(dyinput.size(), 1, [&](int index) { + output[index] = static_cast( + f(static_cast(dyinput[index]), static_cast(xinput[index]), dummy)); + }); + }); +} + +template +inline void activationHostBwd(miopenActivationMode_t activMode, + double gamma, + double beta, + double alpha, + const std::vector dyinput, + const std::vector xinput, + const std::vector yinput, + std::vector& output) +{ + visitActivationHostBwd(activMode, gamma, beta, alpha, [&](auto f) { + miopen::par_for(dyinput.size(), 1, [&](int index) { + output[index] = static_cast(f(static_cast(dyinput[index]), + static_cast(xinput[index]), + static_cast(yinput[index]))); + }); + }); +} + +inline void activationHostBwdElement(miopenActivationMode_t activMode, + double gamma, + double beta, + double alpha, + const double dyinput, + const double xinput, + const double yinput, + double& output) +{ + visitActivationHostBwd(activMode, gamma, beta, alpha, [&](auto f) { + output = 
static_cast(f(dyinput, xinput, yinput)); + }); +} + +template +tensor get_output_tensor(const miopen::ConvolutionDescriptor& filter, + const tensor& input, + const tensor& weights) +{ + return tensor{filter.GetForwardOutputTensor(input.desc, weights.desc, miopen_type{})}; +} diff --git a/projects/miopen/miopen_utils/include/miopen_utils/gemm.hpp b/projects/miopen/miopen_utils/include/miopen_utils/gemm.hpp new file mode 100644 index 000000000000..81c38db0fdf3 --- /dev/null +++ b/projects/miopen/miopen_utils/include/miopen_utils/gemm.hpp @@ -0,0 +1,120 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2017 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + *******************************************************************************/ +#ifndef GUARD_GEMM_HPP +#define GUARD_GEMM_HPP + +#include +#include +#include + +/* + A and B rows and cols should be passed as default values (NxM, MxK), independently of + a_transponse/b_transpose flag value + C rows and cols should have correct values based on a_transponse/b_transpose values + A, B, C strides should have corret values based on a_transponse/b_transpose values +*/ +template +void gemm_cpu(const Dtype* a_ptr, + const size_t a_cols, + const size_t a_rows, + const size_t a_stride, + const bool a_transpose, + const Dtype* b_ptr, + const size_t b_cols, + const size_t b_rows, + const size_t b_stride, + const bool b_transpose, + Dtype* c_ptr, + const size_t c_cols, + const size_t c_rows, + const size_t c_stride, + double alpha = 1.0, + double beta = 1.0) +{ + if((!a_transpose && !b_transpose && + ((a_cols != b_rows) || (a_rows != c_rows) || (b_cols != c_cols))) || + (a_transpose && b_transpose && + ((a_rows != b_cols) || (a_cols != c_rows) || (b_rows != c_cols))) || + (a_transpose && !b_transpose && + ((a_rows != b_rows) || (a_cols != c_rows) || (b_cols != c_cols))) || + (!a_transpose && b_transpose && + ((a_cols != b_cols) || (a_rows != c_rows) || (b_rows != c_cols)))) + { + MIOPEN_THROW("MM_CPU_ERROR. Incompatible matrix size:\nA: " + std::to_string(a_rows) + "x" + + std::to_string(a_cols) + " transpose: " + (a_transpose ? "true" : "false") + + "\nB: " + std::to_string(b_rows) + "x" + std::to_string(b_cols) + + " transpose: " + (b_transpose ? "true" : "false") + + "\nC: " + std::to_string(c_rows) + "x" + std::to_string(c_cols) + "\n"); + } + + size_t inner_loop_limit = a_transpose ? 
a_rows : a_cols; + auto inner_loop = [&](int m, int n) { + double el = 0.0; + if(!a_transpose && !b_transpose) + { + miopen::ford(inner_loop_limit)([&](int k) { + el += static_cast(a_ptr[m * a_stride + k]) * + static_cast(b_ptr[k * b_stride + n]); + }); + } + else if(!a_transpose && b_transpose) + { + miopen::ford(inner_loop_limit)([&](int k) { + el += static_cast(a_ptr[m * a_stride + k]) * + static_cast(b_ptr[n * b_stride + k]); + }); + } + else if(a_transpose && !b_transpose) + { + miopen::ford(inner_loop_limit)([&](int k) { + el += static_cast(a_ptr[k * a_stride + m]) * + static_cast(b_ptr[k * b_stride + n]); + }); + } + else + { + miopen::ford(inner_loop_limit)([&](int k) { + el += static_cast(a_ptr[k * a_stride + m]) * + static_cast(b_ptr[n * b_stride + k]); + }); + } + + c_ptr[m * c_stride + n] = + static_cast(beta * static_cast(c_ptr[m * c_stride + n]) + alpha * el); + }; + + constexpr size_t iter_margin = 1'048'576; // 2^20 + if(c_rows * c_cols * inner_loop_limit > iter_margin) + { + miopen::par_ford(c_rows, c_cols)(inner_loop); + } + else + { + miopen::ford(c_rows, c_cols)(inner_loop); + } +} + +#endif diff --git a/projects/miopen/miopen_utils/include/miopen_utils/network_data.hpp b/projects/miopen/miopen_utils/include/miopen_utils/network_data.hpp new file mode 100644 index 000000000000..987d4dda9929 --- /dev/null +++ b/projects/miopen/miopen_utils/include/miopen_utils/network_data.hpp @@ -0,0 +1,438 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2017 Advanced Micro Devices, Inc. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + *******************************************************************************/ + +#ifndef GUARD_MIOPEN_TEST_NETWORK_DATA_HPP +#define GUARD_MIOPEN_TEST_NETWORK_DATA_HPP + +#include +#include +#include +#include + +#ifndef MIOPEN_TEST_DEFAULT_BATCH_SIZE_FACTOR +#define MIOPEN_TEST_DEFAULT_BATCH_SIZE_FACTOR 0 +#endif + +template +inline constexpr T pick_batch_size(T x, T y) +{ + return (y == 0 || y > x) ? 
1 : x / y; +} + +// Reduce tests execution time +#define MIOPEN_TESTS_GET_INPUTS_ENABLE_HUGE_TENSORS 1 + +template +inline std::set> get_inputs(T n = MIOPEN_TEST_DEFAULT_BATCH_SIZE_FACTOR) +{ + // clang-format off + return + { + { pick_batch_size(32, n), 1, 14, 14 }, + { pick_batch_size(100, n), 1, 8, 8 }, + { pick_batch_size(256, n), 1, 27, 27 }, +#if MIOPEN_TESTS_GET_INPUTS_ENABLE_HUGE_TENSORS + { pick_batch_size(64, n), 19, 1024,2048}, +#endif + { pick_batch_size(100, n), 3, 32, 32 }, + { pick_batch_size(100, n), 32, 16, 16 }, + { pick_batch_size(100, n), 32, 8, 8 }, + { pick_batch_size(128, n), 256, 12, 12 }, + { pick_batch_size(128, n), 3, 231, 231 }, + { pick_batch_size(128, n), 512, 12, 12 }, + { pick_batch_size(256, n), 256, 13, 13 }, + { pick_batch_size(256, n), 3, 227, 227 }, + { pick_batch_size(256, n), 384, 13, 13 }, + { pick_batch_size(256, n), 96, 27, 27 }, + { pick_batch_size(32, n), 128, 28, 28 }, + { pick_batch_size(32, n), 144, 14, 14 }, + { pick_batch_size(32, n), 192, 28, 28 }, + { pick_batch_size(32, n), 192, 7, 7 }, + { pick_batch_size(32, n), 256, 28, 28 }, + { pick_batch_size(32, n), 3, 224, 224 }, + { pick_batch_size(32, n), 32, 28, 28 }, + { pick_batch_size(32, n), 48, 7, 7 }, + { pick_batch_size(32, n), 480, 128, 256 }, + { pick_batch_size(32, n), 480, 64, 128 }, + { pick_batch_size(32, n), 512, 4, 4 }, + { pick_batch_size(32, n), 512, 64, 128 }, + { pick_batch_size(16, n), 64, 56, 56 }, + { pick_batch_size(32, n), 832, 7, 7 }, + { pick_batch_size(64, n), 128, 56, 56 }, + { pick_batch_size(64, n), 256, 28, 28 }, + { pick_batch_size(64, n), 3, 224, 224 }, + { pick_batch_size(64, n), 512, 28, 28 }, + { pick_batch_size(64, n), 64, 112, 112 }, + { pick_batch_size(32, n), 64, 14, 14 }, + { pick_batch_size(32, n), 192, 14, 14 }, + { pick_batch_size(32, n), 320, 28, 28 }, + { pick_batch_size(32, n), 576, 14, 14 }, + { pick_batch_size(32, n), 576, 4, 4 }, + { pick_batch_size(32, n), 1056, 7, 7 }, + { pick_batch_size(32, n), 2048, 11, 11 }, +#if 
MIOPEN_TESTS_GET_INPUTS_ENABLE_HUGE_TENSORS + { pick_batch_size(32, n), 16, 2048, 2048 }, + { pick_batch_size(32, n), 16, 3072, 3072 }, + { pick_batch_size(32, n), 16, 4096, 4096 }, +#endif + { 1, 1, 1, 1 } + }; + // clang-format on +} + +template +inline std::set> get_weights(T n = MIOPEN_TEST_DEFAULT_BATCH_SIZE_FACTOR) +{ + // clang-format off + return + { + { pick_batch_size(1024, n),1024, 3, 3 }, + { pick_batch_size(1024, n),512, 3, 3 }, + { pick_batch_size(128, n), 256, 1, 1 }, + { pick_batch_size(128, n), 528, 1, 1 }, + { pick_batch_size(128, n), 96, 3, 3 }, + { pick_batch_size(16, n), 192, 1, 1 }, + { pick_batch_size(224, n), 112, 3, 3 }, + { pick_batch_size(256, n), 96, 5, 5 }, + { pick_batch_size(288, n), 144, 3, 3 }, + { pick_batch_size(48, n), 832, 1, 1 }, + { pick_batch_size(512, n), 256, 3, 3 }, + { pick_batch_size(64, n), 1, 2, 2 }, + { pick_batch_size(64, n), 3, 3, 3 }, + { pick_batch_size(64, n), 3, 7, 7 }, + { pick_batch_size(64, n), 32, 5, 5 }, + { pick_batch_size(64, n), 480, 1, 1 }, + { pick_batch_size(64, n), 64, 1, 1 }, + { pick_batch_size(96, n), 3, 11, 11 }, + { pick_batch_size(192, n), 64, 5, 5 }, + { pick_batch_size(64, n), 64, 3, 3 }, + { pick_batch_size(224, n), 224, 3, 3 }, + { pick_batch_size(224, n), 192, 3, 3 }, + { pick_batch_size(128, n), 320, 1, 1 }, + { pick_batch_size(192, n), 576, 1, 1 }, + { pick_batch_size(128, n), 1056, 1, 1 }, + { pick_batch_size(128, n), 1024, 1, 1 }, + { pick_batch_size(512, n), 2048, 1, 1 } + }; + // clang-format on +} + +template +inline std::set> get_immed_inputs(T n = MIOPEN_TEST_DEFAULT_BATCH_SIZE_FACTOR) +{ + // clang-format off + return + { + { pick_batch_size(32, n), 1, 14, 14 }, + { pick_batch_size(256, n), 1, 27, 27 }, + { pick_batch_size(128, n), 512, 12, 12 }, + { pick_batch_size(256, n), 256, 13, 13 }, + { pick_batch_size(256, n), 3, 227, 227 }, + { pick_batch_size(32, n), 64, 56, 56 }, + { pick_batch_size(32, n), 96, 14, 14 }, + { pick_batch_size(32, n), 96, 28, 28 }, + { pick_batch_size(64, 
n), 128, 56, 56 }, + { pick_batch_size(64, n), 3, 224, 224 }, + { pick_batch_size(64, n), 256, 14, 14 }, + { 1, 1, 1, 1 } + }; + // clang-format on +} + +template +inline std::set> get_immed_weights(T n = MIOPEN_TEST_DEFAULT_BATCH_SIZE_FACTOR) +{ + // clang-format off + return + { + { pick_batch_size(208, n), 96, 3, 3 }, + { pick_batch_size(24, n), 512, 1, 1 }, + { pick_batch_size(256, n), 128, 3, 3 }, + { pick_batch_size(256, n), 256, 3, 3 }, + { pick_batch_size(256, n), 64, 5, 5 }, + { pick_batch_size(288, n), 144, 3, 3 }, + { pick_batch_size(96, n), 3, 11, 11 }, + { pick_batch_size(32, n), 128, 5, 5 }, + { pick_batch_size(32, n), 128, 1, 1 }, + { pick_batch_size(256, n), 256, 3, 3 }, + { pick_batch_size(512, n), 512, 3, 3 }, + { pick_batch_size(160, n), 128, 3, 3 }, + { pick_batch_size(32, n), 3, 7, 7 } + }; + // clang-format on +} + +template +inline std::set> +get_3d_conv_input_shapes(T n = MIOPEN_TEST_DEFAULT_BATCH_SIZE_FACTOR) +{ + // clang-format off + return + { + { pick_batch_size(128, n), 1, 1, 2, 2}, + { pick_batch_size(128, n), 64, 1, 1, 1}, + { pick_batch_size(128, n), 64, 3, 4, 4}, + { pick_batch_size(352, n), 32, 4, 9, 9}, + { pick_batch_size(192, n), 512, 3, 14, 14}, + { pick_batch_size(352, n), 512, 4, 28, 28}, + { pick_batch_size(256, n), 512, 4, 56, 56}, + { pick_batch_size(192, n), 3, 4, 227, 227}, + { pick_batch_size(128, n), 4, 4, 161, 700} + }; + // clang-format on +} + +template +inline std::set> +get_3d_conv_weight_shapes(T n = MIOPEN_TEST_DEFAULT_BATCH_SIZE_FACTOR) +{ + // clang-format off + return + { + { pick_batch_size( 128, n), 1, 1, 1, 1}, + { pick_batch_size( 352, n), 128, 1, 1, 1}, + { pick_batch_size( 256, n), 128, 1, 1, 1}, + { pick_batch_size( 352, n), 32, 3, 3, 3}, + { pick_batch_size( 352, n), 4, 3, 3, 3}, + { pick_batch_size( 160, n), 4, 3, 5, 5}, + { pick_batch_size( 128, n), 64, 5, 7, 7}, + { pick_batch_size( 192, n), 4, 3, 11, 11}, + { pick_batch_size( 128, n), 1, 3, 1, 7}, + { pick_batch_size( 128, n), 1, 3, 7, 1}, + { 
pick_batch_size( 128, n), 1, 3, 5, 20} + }; + // clang-format on +} + +template +inline std::set> get_bn_peract_inputs(T n = MIOPEN_TEST_DEFAULT_BATCH_SIZE_FACTOR) +{ + // clang-format off + return + { + { pick_batch_size(32, n), 4, 1024,2048}, //Making this much smaller + { pick_batch_size(100, n), 3, 32, 32 }, + { pick_batch_size(100, n), 32, 8, 8 }, + { pick_batch_size(128, n), 256, 12, 12 }, + { pick_batch_size(256, n), 3, 227, 227 }, + { pick_batch_size(64, n), 64, 112, 112 },//Batch-norm ResNet 152 after this line + { pick_batch_size(256, n), 1024, 14, 14 },// n is from the paper @ 256 + { pick_batch_size(256, n), 2048, 7, 7 }, + { pick_batch_size(256, n), 256, 56, 56 }, + { pick_batch_size(256, n), 256, 14, 14 }, + { pick_batch_size(256, n), 512, 28, 28 }, + { pick_batch_size(256, n), 512, 7, 7 }, + { pick_batch_size(256, n), 64, 112, 112 }, + { pick_batch_size(256, n), 64, 56, 56 },//Batch-norm Inception_v3 after this + { pick_batch_size(32, n), 1024, 1, 1 },// n is from the paper @ 32 + { pick_batch_size(32, n), 128, 14, 14 }, + { pick_batch_size(32, n), 128, 28, 28 }, + { pick_batch_size(32, n), 128, 4, 4 }, + { pick_batch_size(32, n), 128, 7, 7 }, + { pick_batch_size(32, n), 160, 7, 7 }, + { pick_batch_size(32, n), 192, 14, 14 }, + { pick_batch_size(32, n), 192, 56, 56 }, + { pick_batch_size(32, n), 192, 7, 7 }, + { pick_batch_size(32, n), 224, 14, 14 }, + { pick_batch_size(32, n), 256, 7, 7 }, + { pick_batch_size(32, n), 256, 14, 14 }, + { pick_batch_size(32, n), 352, 7, 7 }, + { pick_batch_size(32, n), 64, 112, 112 }, + { pick_batch_size(32, n), 64, 14, 14 }, + { pick_batch_size(32, n), 64, 56, 56 }, + { pick_batch_size(32, n), 96, 28, 28 }, + { pick_batch_size(32, n), 32, 256, 512 }, //Killing this config. 
Takes way too long on the CPU + { pick_batch_size(32, n), 256, 28, 28 }, + { pick_batch_size(32, n), 3, 224, 224 }, + { pick_batch_size(32, n), 480, 128, 256 }, + { pick_batch_size(32, n), 528, 64, 128 } + }; + // clang-format on +} + +template +inline std::set> get_bn_spatial_inputs(T n = MIOPEN_TEST_DEFAULT_BATCH_SIZE_FACTOR) +{ + // clang-format off + return + { + { pick_batch_size(32, n), 4, 1024,2048}, //Making this much smaller + { pick_batch_size(32, n), 192, 256, 512 }, + { pick_batch_size(32, n), 480, 128, 256 }, + { pick_batch_size(256, n), 3, 227, 227 }, + { pick_batch_size(256, n), 64, 112, 112 }, + { pick_batch_size(512, n), 16, 32, 32 }, + { pick_batch_size(100, n), 32, 8, 8 }, + { pick_batch_size(128, n), 256, 12, 12 }, + { pick_batch_size(256, n), 128, 28, 28 }, + { pick_batch_size(256, n), 2048, 7, 7 }, + { pick_batch_size(256, n), 256, 56, 56 }, + { pick_batch_size(256, n), 256, 14, 14 }, + { pick_batch_size(256, n), 512, 28, 28 }, + { pick_batch_size(256, n), 512, 7, 7 }, + { pick_batch_size(256, n), 64, 56, 56 },//Batch-norm Inception_v3 after this + { pick_batch_size(32, n), 1024, 1, 1 },// n is from the paper @ 32 + { pick_batch_size(32, n), 128, 14, 14 }, + { pick_batch_size(32, n), 128, 4, 4 }, + { pick_batch_size(32, n), 160, 7, 7 }, + { pick_batch_size(32, n), 192, 14, 14 }, + { pick_batch_size(32, n), 192, 56, 56 }, + { pick_batch_size(32, n), 192, 7, 7 }, + { pick_batch_size(32, n), 224, 14, 14 }, + { pick_batch_size(32, n), 256, 7, 7 }, + { pick_batch_size(32, n), 352, 7, 7 }, + { pick_batch_size(32, n), 64, 14, 14 }, + { pick_batch_size(32, n), 64, 28, 28 }, + { pick_batch_size(32, n), 64, 56, 56 }, + { pick_batch_size(32, n), 96, 28, 28 }, + { pick_batch_size(32, n), 192, 256, 512 }, + { pick_batch_size(32, n), 256, 28, 28 }, + { pick_batch_size(32, n), 3, 224, 224 }, + { pick_batch_size(32, n), 480, 128, 256 }, + { pick_batch_size(32, n), 528, 64, 128 }, + { pick_batch_size(770, n), 1, 8, 8 }, + { pick_batch_size(770, n), 1024, 1, 1 
}, + { pick_batch_size(152, n), 128, 80, 80 }, + { pick_batch_size(152, n), 256, 20, 20 }, + { pick_batch_size(152, n), 32, 160, 160 }, + { pick_batch_size(152, n), 512, 20, 20 }, + { pick_batch_size(152, n), 64, 160, 160 }, + { pick_batch_size(152, n), 64, 80, 80 }, + { pick_batch_size(256, n), 256, 20, 20 }, + { pick_batch_size(256, n), 512, 20, 20 } + }; + // clang-format on +} + +template +inline std::set> get_3d_bn_peract_inputs(T n = MIOPEN_TEST_DEFAULT_BATCH_SIZE_FACTOR) +{ + // clang-format off + return + { + { pick_batch_size(32, n), 1, 32, 32, 32 }, // 32x32x32 based on VoxNet arch + { pick_batch_size(32, n), 1, 14, 14, 14 }, + { pick_batch_size(32, n), 32, 14, 14, 14 }, + { pick_batch_size(32, n), 32, 12, 12, 12 }, + { pick_batch_size(32, n), 32, 6, 6, 6 }, + { pick_batch_size(256, n), 1, 32, 32, 32 }, // 32x32x32 based on VoxNet arch + { pick_batch_size(256, n), 32, 14, 14, 14 }, + { pick_batch_size(256, n), 32, 12, 12, 12 }, + { pick_batch_size(256, n), 32, 6, 6, 6 }, + { pick_batch_size(512, n), 1, 32, 32, 32 }, // 32x32x32 based on VoxNet arch + { pick_batch_size(512, n), 32, 14, 14, 14 }, + { pick_batch_size(512, n), 32, 12, 12, 12 }, + { pick_batch_size(512, n), 32, 6, 6, 6 }, + { pick_batch_size(32, n), 2, 32, 57, 125 }, // Hand-gesture recognition CVPR 2015 paper High Res Net Path + { pick_batch_size(32, n), 32, 14, 25, 59 }, + { pick_batch_size(32, n), 32, 6, 10, 27 }, + { pick_batch_size(32, n), 32, 4, 6, 11 }, + { pick_batch_size(32, n), 32, 2, 2, 3 }, + { pick_batch_size(32, n), 32, 32, 28, 62 }, // Hand-gesture recognition CVPR 2015 paper Low Res Net Path + { pick_batch_size(32, n), 32, 14, 12, 29 }, + { pick_batch_size(32, n), 32, 6, 4, 12 }, + { pick_batch_size(32, n), 32, 4, 2, 2 }, + { pick_batch_size(16, n), 32, 6, 50, 50 }, // Multi-view 3D convnet + { pick_batch_size(1, n), 3, 8, 240, 320 }, // 3D convet on video + { pick_batch_size(1, n), 3, 16, 240, 320 }, // 3D convet on video + { pick_batch_size(1, n), 3, 8, 128, 171 }, // 3D 
convet on video + { pick_batch_size(1, n), 3, 16, 128, 171 }, // 3D convet on video + { pick_batch_size(1, n), 3, 8, 112, 112 }, // 3D convet on video + { pick_batch_size(1, n), 3, 16, 112, 112 } // 3D convet on video + }; + + // clang-format on +} + +template +inline std::set> +get_3d_bn_spatial_inputs(T n = MIOPEN_TEST_DEFAULT_BATCH_SIZE_FACTOR) +{ + // clang-format off + return + { + { pick_batch_size(32, n), 1, 32, 32, 32 }, // 32x32x32 based on VoxNet arch + { pick_batch_size(32, n), 1, 14, 14, 14 }, + { pick_batch_size(32, n), 32, 14, 14, 14 }, + { pick_batch_size(32, n), 32, 12, 12, 12 }, + { pick_batch_size(32, n), 32, 6, 6, 6 }, + { pick_batch_size(256, n), 1, 32, 32, 32 }, // 32x32x32 based on VoxNet arch + { pick_batch_size(256, n), 32, 14, 14, 14 }, + { pick_batch_size(256, n), 32, 12, 12, 12 }, + { pick_batch_size(256, n), 32, 6, 6, 6 }, + { pick_batch_size(512, n), 1, 32, 32, 32 }, // 32x32x32 based on VoxNet arch + { pick_batch_size(512, n), 32, 14, 14, 14 }, + { pick_batch_size(512, n), 32, 12, 12, 12 }, + { pick_batch_size(512, n), 32, 6, 6, 6 }, + { pick_batch_size(32, n), 2, 32, 57, 125 }, // Hand-gesture recognition CVPR 2015 paper High Res Net Path + { pick_batch_size(32, n), 32, 14, 25, 59 }, + { pick_batch_size(32, n), 32, 6, 10, 27 }, + { pick_batch_size(32, n), 32, 4, 6, 11 }, + { pick_batch_size(32, n), 32, 2, 2, 3 }, + { pick_batch_size(32, n), 32, 32, 28, 62 }, // Hand-gesture recognition CVPR 2015 paper Low Res Net Path + { pick_batch_size(32, n), 32, 14, 12, 29 }, + { pick_batch_size(32, n), 32, 6, 4, 12 }, + { pick_batch_size(32, n), 32, 4, 2, 2 }, + { pick_batch_size(16, n), 32, 6, 50, 50 }, // Multi-view 3D convnet + { pick_batch_size(1, n), 3, 8, 240, 320 }, // 3D convet on video + { pick_batch_size(1, n), 3, 16, 240, 320 }, // 3D convet on video + { pick_batch_size(1, n), 3, 8, 128, 171 }, // 3D convet on video + { pick_batch_size(1, n), 3, 16, 128, 171 }, // 3D convet on video + { pick_batch_size(1, n), 3, 8, 112, 112 }, // 3D 
convet on video + { pick_batch_size(1, n), 3, 16, 112, 112 } // 3D convet on video + }; + // clang-format on +} + +template +inline std::vector> get_sub_tensor() +{ + return {{16, 4, 8, 1, 4}, + {2, 4, 8, 8, 4}, + {16, 4, 8, 4}, + {13, 8, 4, 8}, + {3, 8, 7}, + {16, 4, 10}, + {3, 8}, + {16, 4}, + {4}}; +} + +template +inline std::vector> get_tensor_offsets() +{ + static_assert(std::is_signed_v); + return {{0, 0}, {0, 2}, {4, 0}, {5, 7}}; +} + +template +inline std::vector get_tensor_offset() +{ + static_assert(std::is_signed_v); + return {0, 1, 2, 3, 4, 5}; +} + +#endif diff --git a/projects/miopen/miopen_utils/include/miopen_utils/random.hpp b/projects/miopen/miopen_utils/include/miopen_utils/random.hpp new file mode 100644 index 000000000000..63b69ac9875a --- /dev/null +++ b/projects/miopen/miopen_utils/include/miopen_utils/random.hpp @@ -0,0 +1,62 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2021 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + *******************************************************************************/ +#ifndef GUARD_MIOPEN_TEST_RANDOM_HPP +#define GUARD_MIOPEN_TEST_RANDOM_HPP + +#include + +namespace prng { +template +inline T gen_descreet_uniform_sign(double scale, int32_t range) +{ + return static_cast(scale * prng::gen_A_to_B(-range + 1, range)); +} + +template +inline T gen_descreet_unsigned(double scale, int32_t range) +{ + return static_cast(scale * static_cast(gen_0_to_B(range))); +} + +} // namespace prng + +// lambda factory +template +auto uniform_signed_initializer(ScaleT scale_arg, RangeT range_arg) +{ + return [=](auto&&...) -> T { + // uniform sign give balance of both negative and positive values + return prng::gen_descreet_uniform_sign(scale_arg, range_arg); + }; +} + +template +auto uniform_unsigned_initializer(ScaleT scale_arg, RangeT range_arg) +{ + return [=](auto&&...) -> T { return prng::gen_descreet_unsigned(scale_arg, range_arg); }; +} + +#endif // GUARD_MIOPEN_TEST_RANDOM_HPP diff --git a/projects/miopen/miopen_utils/include/miopen_utils/rnn_util.hpp b/projects/miopen/miopen_utils/include/miopen_utils/rnn_util.hpp new file mode 100644 index 000000000000..a6569cebb7e6 --- /dev/null +++ b/projects/miopen/miopen_utils/include/miopen_utils/rnn_util.hpp @@ -0,0 +1,305 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2023 Advanced Micro Devices, Inc. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + *******************************************************************************/ + +#ifndef MIOPEN_RNN_UTIL_H_ +#define MIOPEN_RNN_UTIL_H_ + +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include + +// complexity O(NlogN) +inline std::vector GetReverseOrderIndex(const std::vector& base_index) +{ + std::vector reverse_index(base_index.size()); + unsigned next_rev_index = 0; + for(auto id : base_index) + reverse_index[id] = next_rev_index++; + return reverse_index; +}; + +inline std::vector GetSamplesIndexDescendingOrder(const std::vector& unsorted_seq_lens) +{ + const auto sample_count = unsorted_seq_lens.size(); + + std::vector index_v(sample_count); + std::iota(index_v.begin(), index_v.end(), 0); + + auto seq_len_cmp = [&unsorted_seq_lens](unsigned a_id, unsigned b_id) { + return unsorted_seq_lens[a_id] > unsorted_seq_lens[b_id]; + }; + + std::stable_sort(index_v.begin(), index_v.end(), seq_len_cmp); + + return index_v; +} + +template +inline void HiddenTensorReorder(const std::vector& src_array, + std::vector& dst_array, + const std::vector& batch_order, + const std::vector hid_len, + bool is_dst_direct_order) +{ + const size_t copy_size = hid_len[2]; + + const size_t batch_stride = hid_len[2]; + const size_t layer_stride = batch_stride * hid_len[1]; + + for(size_t batch_id = 0; batch_id < hid_len[1]; batch_id++) + { + const auto src_batch_off = + batch_stride * (is_dst_direct_order ? batch_order[batch_id] : batch_id); + const auto dst_batch_off = + batch_stride * (is_dst_direct_order ? 
batch_id : batch_order[batch_id]); + + for(size_t layer_id = 0; layer_id < hid_len[0]; layer_id++) + { + const auto dst_offset = dst_batch_off + layer_id * layer_stride; + const auto src_offset = src_batch_off + layer_id * layer_stride; + + std::copy(src_array.begin() + src_offset, + src_array.begin() + src_offset + copy_size, + dst_array.begin() + dst_offset); + } + } +} + +inline void createTensorDescArray(std::vector& td, + std::vector& ptd, + const std::vector bs, + const int secondDim, + miopenDataType_t dataType) +{ + + std::transform(bs.begin(), bs.end(), std::back_inserter(td), [&](int x) { + return miopen::TensorDescriptor( + dataType, {static_cast(x), static_cast(secondDim)}); + }); + std::transform(td.begin(), td.end(), std::back_inserter(ptd), [](miopen::TensorDescriptor& x) { + return &x; + }); +} + +inline std::tuple +GetTempPackedBuffersSize(std::vector batchs, int in_vec, int out_vec) +{ + size_t total_batch = std::accumulate(batchs.begin(), batchs.end(), 0ULL); + + size_t in_buff_size = total_batch * in_vec; + size_t out_buff_size = total_batch * out_vec; + return {in_buff_size, out_buff_size}; +} + +inline size_t getSuperTensorSize(const std::vector& bs, + int seqLength, + int inputSize, + int hiddenSize, + int maxPaddingVal, + bool isBidirect, + bool isInput, + bool isPadded) +{ + return (isPadded // + ? static_cast(seqLength) * maxPaddingVal + : std::accumulate(bs.begin(), bs.end(), 0ULL)) // + * (isInput // + ? static_cast(inputSize) + : static_cast(hiddenSize) * (isBidirect ? 
2 : 1)); +} + +template +void ChangeDataPadding(const std::vector& src_array, + std::vector& dst_array, + const std::vector& batch_list, + int max_batch, + int sample_size, + bool is_src_packed) +{ + auto seq_len = batch_list.size(); + + auto scr_ptr = &src_array[0]; + auto dst_ptr = &dst_array[0]; + + for(int seq_id = 0; seq_id < seq_len; seq_id++) + { + auto packed_size = batch_list[seq_id] * sample_size; + + std::copy(scr_ptr, scr_ptr + packed_size, dst_ptr); + + if(is_src_packed) + { + dst_ptr += max_batch * sample_size; + scr_ptr += packed_size; + } + else + { + scr_ptr += max_batch * sample_size; + dst_ptr += packed_size; + } + } +} + +// RNN VANILLA configs +inline std::vector get_rnn_num_layers() { return {{1, 3}}; } + +inline std::vector get_rnn_batchSize() { return {{1, 17}}; } + +inline std::vector get_rnn_seq_len() { return {{1, 3, 51}}; } + +inline std::vector get_rnn_vector_len() { return {31}; } + +inline std::vector get_rnn_hidden_size() { return {127}; } + +// LSTM configs +inline std::vector get_lstm_num_layers() { return {{1, 3}}; } + +inline std::vector get_lstm_batchSize() { return {{1, 17}}; } + +inline std::vector get_lstm_seq_len() { return {{1, 25}}; } + +inline std::vector get_lstm_vector_len() { return {17}; } + +inline std::vector get_lstm_hidden_size() { return {67}; } + +// GRU configs +inline std::vector get_gru_num_layers() { return {{1, 3}}; } + +inline std::vector get_gru_batchSize() { return {{1, 17}}; } + +inline std::vector get_gru_seq_len() { return {{1, 23}}; } + +inline std::vector get_gru_vector_len() { return {13}; } + +inline std::vector get_gru_hidden_size() { return {67}; } + +inline std::vector> generate_batchSeq(const int batchSize, const int seqLength) +{ + + static constexpr int modval = 3; + + int currentval = batchSize; + std::vector batchSeq; + batchSeq.reserve(seqLength); + for(int i = 0; i < seqLength; i++) + { + if(i > 0) + { + int nvalue = currentval - prng::gen_0_to_B(modval); + currentval = (nvalue < 1) ? 
1 : nvalue; + // printf("current value: %d\n", currentval); + } + // printf("adding a value to batch sequence: %d\n", currentval); + batchSeq.push_back(currentval); + } + return {batchSeq}; +} + +inline int sumvc(const std::vector& x) { return std::accumulate(x.begin(), x.end(), 0); } + +template +inline T activfunc(T x, int actvf) +{ + T alpha = static_cast(1), beta0 = static_cast(0), beta1 = static_cast(1); + if(actvf == 0) + { + return (x > 0) ? x : x * beta0; + } + else if(actvf == 2) + { + return static_cast(1 / (1 + std::exp(-x))); + } + return static_cast(alpha * std::tanh(beta1 * x)); +} + +template +inline T dervactivfunc(T x, int actvf) +{ + if(actvf == 0) + { + return static_cast(x > 0 ? 1 : 0); + } + else if(actvf == 2) + { + return static_cast(std::exp(-x) / (1 + std::exp(-x)) / (1 + std::exp(-x))); + } + + return static_cast(1 / std::cosh(x) / std::cosh(x)); +} + +template +void RNN_mm_cpu_batched(const Dtype* a_ptr, + size_t a_cols, + size_t a_rows, + size_t lda, + size_t a_stride, + int a_flags, + const Dtype* b_ptr, + size_t b_cols, + size_t b_rows, + size_t ldb, + size_t b_stride, + int b_flags, + Dtype* c_ptr, + size_t c_cols, + size_t c_rows, + size_t ldc, + size_t c_stride, + int batchCount, + double alpha, + double beta) +{ + for(int i = 0; i < batchCount; ++i) + { + gemm_cpu(a_ptr + a_stride * i, + a_cols, + a_rows, + lda, + a_flags == 1 ? true : false, + b_ptr + b_stride * i, + b_cols, + b_rows, + ldb, + b_flags == 1 ? 
true : false, + c_ptr + c_stride * i, + c_cols, + c_rows, + ldc, + alpha, + beta); + } +} + +#endif diff --git a/projects/miopen/miopen_utils/include/miopen_utils/serialize.hpp b/projects/miopen/miopen_utils/include/miopen_utils/serialize.hpp new file mode 100644 index 000000000000..71d3133df063 --- /dev/null +++ b/projects/miopen/miopen_utils/include/miopen_utils/serialize.hpp @@ -0,0 +1,129 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2018 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + *******************************************************************************/ + +#ifndef MIOPEN_GUARD_TEST_SERIALIZE_HPP +#define MIOPEN_GUARD_TEST_SERIALIZE_HPP + +#include +#include +#include +#include +#include +#include +#include + +template +struct is_trivial_serializable : std::is_trivially_copy_constructible +{ +}; + +template <> +struct is_trivial_serializable : std::true_type +{ +}; + +template +std::enable_if_t{}> serialize(std::ostream& os, const T& x) +{ + os.write(reinterpret_cast(&x), sizeof(T)); +} + +template +auto serialize(std::ostream& os, + const T& x) -> decltype(x.begin(), x.end(), T(x.begin(), x.end()), void()) +{ + std::size_t n = std::distance(x.begin(), x.end()); + serialize(os, n); + for(auto&& y : x) + serialize(os, y); +} + +template +std::enable_if_t>{}> +serialize(std::ostream& os, const std::tuple& t) +{ + miopen::unpack( + [&](auto&&... xs) { miopen::each_args([&](auto&& x) { serialize(os, x); }, xs...); }, t); +} + +template +std::enable_if_t{}> serialize(std::istream& is, T& x) +{ + is.read(reinterpret_cast(&x), sizeof(T)); +} + +template +std::enable_if_t{}> serialize(std::istream& is, std::vector& x) +{ + std::size_t n; + serialize(is, n); + x.resize(n); + is.read(reinterpret_cast(x.data()), sizeof(T) * n); +} + +template +auto serialize(std::istream& is, + T& x) -> decltype(x.begin(), x.end(), x.assign(x.begin(), x.end()), void()) +{ + using value_type = std::decay_t; + std::size_t n; + serialize(is, n); + std::vector v; + v.reserve(n); + for(std::size_t i = 0; i < n; i++) + { + value_type y; + serialize(is, y); + v.push_back(y); + } + x.assign(v.begin(), v.end()); +} + +template +std::enable_if_t>{}> +serialize(std::istream& is, + // cppcheck-suppress constParameter + std::tuple& t) +{ + miopen::unpack( + [&](auto&&... 
xs) { miopen::each_args([&](auto&& x) { serialize(is, x); }, xs...); }, t); +} + +template +void load(std::string name, T& x) +{ + std::ifstream is{name.c_str()}; + serialize(is, x); +} + +template +void save(std::string name, const T& x) +{ + std::ofstream os{name.c_str()}; + serialize(os, x); +} + +#endif diff --git a/projects/miopen/miopen_utils/include/miopen_utils/tensor_holder.hpp b/projects/miopen/miopen_utils/include/miopen_utils/tensor_holder.hpp new file mode 100644 index 000000000000..f762f80f280c --- /dev/null +++ b/projects/miopen/miopen_utils/include/miopen_utils/tensor_holder.hpp @@ -0,0 +1,505 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2017 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + *******************************************************************************/ +#ifndef GUARD_TENSOR_HOLDER_HPP +#define GUARD_TENSOR_HOLDER_HPP + +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include +using half = half_float::half; +using hip_bfloat16 = bfloat16; +#include "../../src/kernels/hip_float8.hpp" +using float8_fnuz = miopen_f8::hip_f8; +using bfloat8_fnuz = miopen_f8::hip_f8; + +#include +#include + +template +void visit_tensor_size(std::size_t n, F f) +{ + switch(n) + { + case 0: { + f(std::integral_constant{}); + break; + } + case 1: { + f(std::integral_constant{}); + break; + } + case 2: { + f(std::integral_constant{}); + break; + } + case 3: { + f(std::integral_constant{}); + break; + } + case 4: { + f(std::integral_constant{}); + break; + } + case 5: { + f(std::integral_constant{}); + break; + } + default: throw std::runtime_error("Unknown tensor size"); + } +} + +template +struct miopen_type; + +template <> +struct miopen_type : std::integral_constant +{ +}; + +template <> +struct miopen_type : std::integral_constant +{ +}; + +template <> +struct miopen_type : std::integral_constant +{ +}; +template <> +struct miopen_type : std::integral_constant +{ +}; + +template <> +struct miopen_type : std::integral_constant +{ +}; + +template <> +struct miopen_type : std::integral_constant +{ +}; + +template <> +struct miopen_type : std::integral_constant +{ +}; + +template <> +struct miopen_type : std::integral_constant +{ +}; + +template <> +struct miopen_type : std::integral_constant +{ +}; + +template <> +struct miopen_type : std::integral_constant +{ +}; + +template <> +struct miopen_type : std::integral_constant +{ +}; + +template <> +struct miopen_type : std::integral_constant +{ +}; + +template +struct tensor +{ + using value_type = T; + miopen::TensorDescriptor desc; + std::vector data; + +#if defined(__clang__) || defined(__GNUG__) +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored 
"-Wdeprecated-declarations" +#endif + + tensor() : desc(miopen_type{}) {} + +#if defined(__clang__) || defined(__GNUG__) +#pragma GCC diagnostic pop +#endif + + template + tensor(const std::vector& dims) : desc(miopen_type{}, dims), data(desc.GetElementSpace()) + { + } + + template + tensor(const std::vector& dims, const std::vector& strides) + : desc(miopen_type{}, dims, strides), data(desc.GetElementSpace()) + { + assert(dims.size() == strides.size()); + } + + template + tensor(miopenTensorLayout_t layout, const std::vector& dims) + : desc(miopen_type{}, layout, dims), data(desc.GetElementSpace()) + { + } + + template + tensor(miopenTensorLayout_t layout, const std::vector& dims, const std::vector& strides) + : desc(miopen_type{}, layout, dims, strides), data(desc.GetElementSpace()) + { + assert(dims.size() == strides.size()); + } + + tensor(std::size_t n, std::size_t c, std::size_t h, std::size_t w) + : desc(miopen_type{}, {n, c, h, w}), data(n * c * h * w) + { + } + + tensor(miopenTensorLayout_t layout, std::size_t n, std::size_t c, std::size_t h, std::size_t w) + : desc(miopen_type{}, layout, {n, c, h, w}), data(desc.GetElementSpace()) + { + } + + tensor(std::size_t n, std::size_t c, std::size_t d, std::size_t h, std::size_t w) + : desc(miopen_type{}, {n, c, d, h, w}), data(n * c * d * h * w) + { + } + + tensor(std::size_t n) : desc(miopen_type{}, {n}), data(n) {} + + tensor(miopen::TensorDescriptor rhs) : desc(std::move(rhs)) + { + assert(desc.GetType() == miopen_type{} + /// In the driver, T is input tensor type, but output tensor holders + /// are instantiatied with T as well. This leads to false assertion + /// failures when T is INT8 because output type is different. 
+ /// \todo Get rid of this hack when the driver is improved: + || (miopen_type{} == miopenInt8 && desc.GetType() == miopenInt32)); + data.resize(desc.GetElementSpace()); + } + + size_t GetDataByteSize() const { return GetSize() * sizeof(T); } + + size_t GetSize() const { return desc.GetElementSpace(); } + + template + tensor& generate(G g) & + { + if(this->desc.GetVectorLength() > 1) + this->generate_vect_impl(g); + else + this->generate_impl(g); + return *this; + } + + template + tensor&& generate(G g) && + { + if(this->desc.GetVectorLength() > 1) + this->generate_vect_impl(g); + else + this->generate_impl(g); + return std::move(*this); + } + + template + void generate_impl(G g) + { + auto seed = std::accumulate(desc.GetLengths().begin(), + desc.GetLengths().end(), + std::size_t{521288629}, + [](auto x, auto y) { + x ^= x << 1U; + return x ^ y; + }); + seed ^= data.size(); + seed ^= desc.GetLengths().size(); + prng::reset_seed(seed); + auto iterator = data.begin(); + auto assign = [&](T x) { + *iterator = x; + ++iterator; + }; + this->for_each( + miopen::compose(miopen::compose(assign, miopen::cast_to()), std::move(g))); + } + + template + void generate_vect_impl(G g) + { + auto seed = std::accumulate(desc.GetLengths().begin(), + desc.GetLengths().end(), + std::size_t{521288629}, + [](auto x, auto y) { + x ^= x << 1U; + return x ^ y; + }); + seed ^= data.size(); + seed ^= desc.GetLengths().size(); + prng::reset_seed(seed); + auto iterator = data.begin(); + auto vectorLength = desc.GetVectorLength(); + auto assign = [&](T x) { + assert(iterator < data.end()); + // for debugging + for(auto i = 0; i < vectorLength; i++) + { + *(iterator + i) = x; + } + iterator += vectorLength; + }; + this->for_each( + miopen::compose(miopen::compose(assign, miopen::cast_to()), std::move(g))); + } + + template + struct for_each_unpacked + { + Loop loop; + F f; + template + auto operator()(Ts... 
xs) const -> decltype(f(xs...), void()) + { + loop(xs...)(std::move(f)); + } + + struct any + { + any() {} + template + any(X) + { + } + }; + + [[noreturn]] void operator()(any = {}, + any = {}, + any = {}, + any = {}, + any = {}, + any = {}, + any = {}, + any = {}, + any = {}) const + { + throw std::runtime_error( + "Arguments to for_each do not match tensor size or the function " + + miopen::get_type_name() + " can not be called."); + } + }; + + struct for_each_handler + { + template + void operator()(Self* self, Loop loop, F f, Size size) const + { + auto dims = miopen::tien(self->desc.GetLengths()); + miopen::unpack(for_each_unpacked{loop, std::move(f)}, dims); + } + }; + + template + void for_each(F f) const + { + visit_tensor_size( + desc.GetLengths().size(), + std::bind(for_each_handler{}, this, miopen::ford, std::move(f), std::placeholders::_1)); + } + + template + void par_for_each(F f) const + { + visit_tensor_size( + desc.GetLengths().size(), + std::bind( + for_each_handler{}, this, miopen::par_ford, std::move(f), std::placeholders::_1)); + } + + template + T& operator()(Ts... xs) + { + assert(this->desc.GetIndex(xs...) < data.size()); + return this->data[this->desc.GetIndex(xs...)]; + } + + template + const T& operator()(Ts... xs) const + { + assert(this->desc.GetIndex(xs...) < data.size()); + return this->data[this->desc.GetIndex(xs...)]; + } + + template + const T& operator()(const std::array& multi_id) const + { + auto f = [&](auto... 
is) { return this->desc.GetIndex(is...); }; + assert(miopen::unpack(f, multi_id) < data.size()); + return this->data[miopen::unpack(f, multi_id)]; + } + + T& operator[](std::size_t i) { return data.at(i); } + + const T& operator[](std::size_t i) const { return data.at(i); } + + typename std::vector::iterator begin() { return data.begin(); } + + typename std::vector::iterator end() { return data.end(); } + + typename std::vector::const_iterator begin() const { return data.begin(); } + + typename std::vector::const_iterator end() const { return data.end(); } + + friend std::ostream& operator<<(std::ostream& stream, const tensor& t) + { + return stream << t.desc; + } + + template + void dump_inner(size_t dim, std::array& coord, Stream& stream) const + { + const auto lengths = this->desc.GetLengths(); + if(lengths.size() == 0) + { + // 0D special case: Just print the one value that we have and return. + stream << (*this)(coord); + } + else if(dim + 1 == lengths.size()) + { + // 1D special case: dump everything on one line + for(size_t i = 0; i < lengths[dim]; ++i) + { + if(i != 0) + stream << ' '; + + coord[dim] = i; + stream << std::setw(4) << (*this)(coord); + } + + stream << '\n'; + } + else + { + if(dim + 2 == lengths.size()) + { + // 2D special case: Also print which 2D slice we are currently printing + // Note: this is not needed for higher dimensions, as they will also pass + // through this branch. 
+ stream << "slice ["; + for(size_t i = 0; i < dim; ++i) + { + stream << coord[i] << ", "; + } + stream << ":, :]\n"; + } + + for(size_t i = 0; i < lengths[dim]; ++i) + { + coord[dim] = i; + this->dump_inner(dim + 1, coord, stream); + } + } + } + + template + void dump(const char* name, Stream& stream = std::cout) const + { + const auto n = this->desc.GetLengths().size(); + stream << "==== " << name << ": " << *this << n << '\n'; + stream.fill(' '); + + const auto flags = stream.flags(); + + visit_tensor_size(n, [&](const auto size) { + constexpr size_t N = decltype(size)::value; + std::array coord; + this->dump_inner(0, coord, stream); + }); + + stream.flags(flags); + } +}; + +template +void serialize(std::istream& s, tensor& x) +{ + std::vector lens; + serialize(s, lens); + std::vector strides; + serialize(s, strides); + x.desc = miopen::TensorDescriptor{miopen_type{}, lens, strides}; + serialize(s, x.data); +} + +template +void serialize(std::ostream& s, const tensor& x) +{ + const auto& lens = x.desc.GetLengths(); + const auto& strides = x.desc.GetStrides(); + serialize(s, lens); + serialize(s, strides); + serialize(s, x.data); +} + +struct tensor_generate +{ + template + Tensor&& operator()(Tensor&& t, G g) const + { + return std::forward(t.generate(g)); + } +}; + +struct tensor_elem_gen_integer +{ + uint64_t max_value = 17; + + template + double operator()(Ts... 
Xs) const + { + static_assert(sizeof...(Ts) < 6, + "Dimensions in tensor_elem_gen_integer must be less than 6."); + assert(max_value > 0); + std::array left = {{Xs...}}; + std::array right = {{613, 547, 701, 877, 1049}}; + uint64_t dot = + std::inner_product(left.begin(), left.end(), right.begin(), static_cast(173)); + return static_cast(dot % max_value); + } +}; + +#endif diff --git a/projects/miopen/miopen_utils/include/miopen_utils/verify.hpp b/projects/miopen/miopen_utils/include/miopen_utils/verify.hpp new file mode 100644 index 000000000000..81af2afbcf2d --- /dev/null +++ b/projects/miopen/miopen_utils/include/miopen_utils/verify.hpp @@ -0,0 +1,245 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2017 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + *******************************************************************************/ +#ifndef GUARD_VERIFY_HPP +#define GUARD_VERIFY_HPP + +#include +#include +#include +#include +#include +#include +#include +#include +using half = half_float::half; +using hip_bfloat16 = bfloat16; +#include +#include + +namespace miopen { + +// Compute the value of a range +template +using range_value = typename std::decay().begin())>::type; + +struct sum_fn +{ + template + auto operator()(T x, U y) const MIOPEN_RETURNS(x + y); +}; +static constexpr sum_fn sum{}; + +struct max_fn +{ + template + static T id(T x) + { + return x; + } + + template + auto operator()(T x, U y) const MIOPEN_RETURNS(max_fn::id(x > y ? x : y)); +}; +static constexpr max_fn max{}; + +namespace abs_diff_detail { +using std::fabs; +struct fn +{ + template + auto operator()(T x, U y) const MIOPEN_RETURNS(fabs(x - y)); +}; + +} // namespace abs_diff_detail + +static constexpr abs_diff_detail::fn abs_diff{}; + +struct not_finite_fn +{ + template ), bool>::type = false> + bool operator()(T x) const + { + return !std::isfinite(x); + } + + template ::type, half_float::half>), + bool>::type = false> + bool operator()(T x) const + { + return !half_float::isfinite(x); + } + + template ::type, bfloat16>), + bool>::type = false> + bool operator()(T x) const + { + return !std::isfinite(x); // bfloat16 has float() conversion operator + } + + template ), bool>::type = false> + bool operator()(T x) const + { + std::ignore = x; + return false; + } +}; +static constexpr not_finite_fn not_finite{}; + +template +T as(T, U x) +{ + return x; +} + +struct compare_mag_fn +{ + template + bool operator()(T x, U y) const + { + using std::fabs; + return fabs(x) < fabs(y); + } +}; +static constexpr compare_mag_fn compare_mag{}; + +struct square_diff_fn +{ + template + double operator()(T x, U y) const + { + double diff = static_cast(x - y); + return diff * diff; + } +}; +static constexpr square_diff_fn square_diff{}; + +template , 
bool> = true> +bool equal_values(T const& lhs, T const& rhs) +{ + return lhs == rhs; +} + +template , bool> = true> +bool equal_values(T const& lhs, T const& rhs) +{ + return miopen::float_equal_sentinel(lhs, rhs); +} + +template +bool range_empty(R1&& r1) +{ + return r1.begin() == r1.end(); +} + +template +auto range_distance(R1&& r1) MIOPEN_RETURNS(std::distance(r1.begin(), r1.end())); + +template +bool range_zero(const std::vector& r) +{ + return std::all_of(r.begin(), r.end(), [](T x) { return equal_values(x, T()); }); +} + +template +bool range_zero(const tensor& r) +{ + return range_zero(r.data); +} + +template +T range_product(R1&& r1, R2&& r2, T state, Reducer r, Product p) +{ + return std::inner_product(r1.begin(), r1.end(), r2.begin(), state, r, p); +} + +template +std::size_t mismatch_idx(R1&& r1, R2&& r2, Compare compare) +{ + auto p = std::mismatch(r1.begin(), r1.end(), r2.begin(), compare); + return std::distance(r1.begin(), p.first); +} + +template +int64_t find_idx(R1&& r1, Predicate p) +{ + auto it = std::find_if(r1.begin(), r1.end(), p); + if(it == r1.end()) + return -1; + else + return std::distance(r1.begin(), it); +} + +template +double max_diff(R1&& r1, R2&& r2) +{ + return range_product(r1, r2, 0.0, max, abs_diff); +} + +template +auto max_diff_v2(R1&& r1, R2&& r2) +{ + using T = decltype(r1[0] - r2[0]); + auto abs_diff_func = [](auto x, auto y) { return x > y ? 
x - y : y - x; }; + // BUG: deduced wrong datatype, half_float bug + if constexpr(std::is_same_v) + return range_product(r1, r2, half_float::half(), max, abs_diff_func); + else + return range_product(r1, r2, T(), max, abs_diff_func); +} + +template +std::size_t mismatch_diff(R1&& r1, R2&& r2, T diff) +{ + return mismatch_idx( + r1, + r2, + std::bind( + float_equal, diff, std::bind(abs_diff, std::placeholders::_1, std::placeholders::_2))); +} + +template +double rms_range(R1&& r1, R2&& r2) +{ + std::size_t n = range_distance(r1); + if(n == range_distance(r2)) + { + if(n == 0) + return 0; + double square_difference = range_product(r1, r2, 0.0, sum_fn{}, square_diff); + double mag1 = static_cast(*std::max_element(r1.begin(), r1.end(), compare_mag)); + double mag2 = static_cast(*std::max_element(r2.begin(), r2.end(), compare_mag)); + double mag = + std::max({std::fabs(mag1), std::fabs(mag2), std::numeric_limits::min()}); + return std::sqrt(square_difference) / (std::sqrt(n) * mag); + } + else + return double(std::numeric_limits>::max()); +} +} // namespace miopen +#endif diff --git a/projects/miopen/test/CMakeLists.txt b/projects/miopen/test/CMakeLists.txt index bef91d0ea871..035f1314fc63 100755 --- a/projects/miopen/test/CMakeLists.txt +++ b/projects/miopen/test/CMakeLists.txt @@ -414,9 +414,9 @@ function(add_test_executable TEST_NAME) endif() # MIOpen_with_plugins ensures CK plugin .so's are built alongside the test if(NOT MIOPEN_EMBED_DB STREQUAL "") - target_link_libraries(${TEST_NAME} MIOpen_with_plugins miopen_data miopen_common_utils) + target_link_libraries(${TEST_NAME} MIOpen_with_plugins miopen_data miopen_common_utils miopen_utils) else() - target_link_libraries(${TEST_NAME} MIOpen_with_plugins miopen_common_utils) + target_link_libraries(${TEST_NAME} MIOpen_with_plugins miopen_common_utils miopen_utils) endif() target_include_directories(${TEST_NAME} PRIVATE ../src/kernels) if(WIN32) diff --git a/projects/miopen/test/cpu_bias.hpp 
b/projects/miopen/test/cpu_bias.hpp index 9b0c2578feef..4b150035d5c0 100644 --- a/projects/miopen/test/cpu_bias.hpp +++ b/projects/miopen/test/cpu_bias.hpp @@ -1,141 +1,5 @@ -/******************************************************************************* - * - * MIT License - * - * Copyright (c) 2019 Advanced Micro Devices, Inc. - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - * - *******************************************************************************/ +// Forwarding header — implementation moved to miopen_utils. 
#ifndef GUARD_CPU_BIAS_HPP #define GUARD_CPU_BIAS_HPP - -#include "test.hpp" -#include -#include -#include -#include -#include -#include -#include -#include - -#include "tensor_holder.hpp" -#include -#include - -template -void cpu_bias_forward_impl(tensor& out, const tensor& bias) -{ - assert(out.desc.GetNumDims() == NSpatialDim + 2 and bias.desc.GetNumDims() == NSpatialDim + 2); - assert( - bias.desc.GetLengths()[0] == 1 && bias.desc.GetLengths()[1] == out.desc.GetLengths()[1] && - std::all_of(bias.desc.GetLengths().begin() + 2, bias.desc.GetLengths().end(), [](auto v) { - return v == 1; - })); - - out.par_for_each([&](auto out_n_id, auto out_k_id, auto... out_spatial_id_pack) { - out(out_n_id, out_k_id, out_spatial_id_pack...) = - double(out(out_n_id, out_k_id, out_spatial_id_pack...)) + double(bias.data[out_k_id]); - }); -} - -template -void cpu_bias_backward_data_impl(const tensor& out, tensor& bias) -{ - assert(out.desc.GetNumDims() == NSpatialDim + 2 and bias.desc.GetNumDims() == NSpatialDim + 2); - assert( - bias.desc.GetLengths()[0] == 1 && bias.desc.GetLengths()[1] == out.desc.GetLengths()[0] && - std::all_of(bias.desc.GetLengths().begin() + 2, bias.desc.GetLengths().end(), [](auto v) { - return v == 1; - })); - - std::size_t out_n_len = out.desc.GetLengths()[0]; - std::size_t out_k_len = out.desc.GetLengths()[1]; - - std::array out_spatial_len{}; - std::copy_n(out.desc.GetLengths().begin() + 2, NSpatialDim, out_spatial_len.begin()); - - miopen::par_ford(out_k_len)([&](auto out_k_id) { - auto ford_out_n_spatial = - miopen::unpacker(miopen::prepender(miopen::ford, out_n_len))(out_spatial_len); - - double acc = 0; - ford_out_n_spatial([&](auto out_n_id, auto... 
out_spatial_id_pack) { - acc += double(out(out_n_id, out_k_id, out_spatial_id_pack...)); - }); - - bias.data[out_k_id] = acc; - }); -} - -template -void cpu_bias_forward(tensor& out, const tensor& bias) -{ - switch(out.desc.GetNumDims()) - { - case 3: { - cpu_bias_forward_impl<1>(out, bias); - break; - } - case 4: { - cpu_bias_forward_impl<2>(out, bias); - break; - } - case 5: { - cpu_bias_forward_impl<3>(out, bias); - break; - } - case 6: { - cpu_bias_forward_impl<4>(out, bias); - break; - } - default: { - MIOPEN_THROW("not belong to any case"); - } - } -} - -template -void cpu_bias_backward_data(const tensor& out, tensor& bias) -{ - switch(out.desc.GetNumDims()) - { - case 3: { - cpu_bias_backward_data_impl<1>(out, bias); - break; - } - case 4: { - cpu_bias_backward_data_impl<2>(out, bias); - break; - } - case 5: { - cpu_bias_backward_data_impl<3>(out, bias); - break; - } - case 6: { - cpu_bias_backward_data_impl<4>(out, bias); - break; - } - default: { - MIOPEN_THROW("not belong to any case"); - } - } -} +#include #endif diff --git a/projects/miopen/test/cpu_conv.hpp b/projects/miopen/test/cpu_conv.hpp index 895262311b12..fac5227efe75 100644 --- a/projects/miopen/test/cpu_conv.hpp +++ b/projects/miopen/test/cpu_conv.hpp @@ -1,515 +1,5 @@ -/******************************************************************************* - * - * MIT License - * - * Copyright (c) 2019 Advanced Micro Devices, Inc. 
- * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - * - *******************************************************************************/ +// Forwarding header — implementation moved to miopen_utils. #ifndef GUARD_CPU_CONV_HPP #define GUARD_CPU_CONV_HPP - -#include "test.hpp" -#include -#include -#include -#include -#include -#include -#include -#include - -#include "tensor_holder.hpp" -#include -#include -#include - -template -static constexpr auto make_array(T x, Ts... 
xs) -{ - return std::array{{x, xs...}}; -} - -template -struct PassThru -{ - T operator()(T t) { return t; } -}; - -template -struct cpu_convolution_acc_type -{ - using type = double; // default using double as accumulator -}; - -template <> -struct cpu_convolution_acc_type -{ - using type = int32_t; -}; - -template <> -struct cpu_convolution_acc_type -{ - using type = double; -}; - -template -void cpu_convolution_forward_impl(const tensor& in, - const tensor& wei, - tensor& out, - const Range& pads, - const Range& strides, - const Range& dilations, - std::size_t group_count, - FI fi = {}, - FW fw = {}) -{ - static_assert(ConvDim > 0, "wrong! convolution dim should be larger than 0"); - assert(in.desc.GetNumDims() == ConvDim + 2 and wei.desc.GetNumDims() == ConvDim + 2 and - out.desc.GetNumDims() == ConvDim + 2 and pads.size() == ConvDim and - strides.size() == ConvDim and dilations.size() == ConvDim); - std::size_t out_n_len = out.desc.GetLengths()[0]; - - std::size_t wei_k_len = wei.desc.GetLengths()[0]; - std::size_t wei_c_len = wei.desc.GetLengths()[1]; - - std::size_t vector_len = in.desc.GetVectorLength(); - - std::array in_spatial_len{}; - std::array wei_spatial_len{}; - std::array out_spatial_len{}; - - std::copy_n(in.desc.GetLengths().begin() + 2, ConvDim, in_spatial_len.begin()); - std::copy_n(wei.desc.GetLengths().begin() + 2, ConvDim, wei_spatial_len.begin()); - std::copy_n(out.desc.GetLengths().begin() + 2, ConvDim, out_spatial_len.begin()); - - if(wei.desc.GetLayout_str() == "CHWNc") - { - wei_c_len = wei.desc.GetLengths()[0]; - std::copy_n(wei.desc.GetLengths().begin() + 1, ConvDim, wei_spatial_len.begin()); - wei_k_len = wei.desc.GetLengths()[3]; - } - - std::size_t wei_k_len_per_group = wei_k_len / group_count; - - // f(x0, x1, xs...) - // f1(xs...) = f(x0, x1, xs...) - // f2(xs_array) = f1(xs...) 
- auto par_ford_out_nk_spatial = miopen::unpacker( - miopen::prepender(miopen::par_ford, out_n_len, wei_k_len))(out_spatial_len); - - par_ford_out_nk_spatial([&](std::size_t out_n_id, - std::size_t out_k_id, - auto... out_spatial_id_pack) { - auto out_spatial_id = make_array(out_spatial_id_pack...); - - std::size_t group_id = out_k_id / wei_k_len_per_group; - Tacc acc = 0; - - miopen::ford(wei_c_len)([&](std::size_t wei_c_id) { - std::size_t in_c_id = group_id * wei_c_len + wei_c_id; - - auto ford_wei_spatial = miopen::unpacker(miopen::ford)(wei_spatial_len); - - ford_wei_spatial([&](auto... wei_spatial_id_pack) { - auto wei_spatial_id = make_array(wei_spatial_id_pack...); - - std::array in_spatial_id{}; - - for(std::size_t i = 0; i < ConvDim; ++i) - { - in_spatial_id[i] = - out_spatial_id[i] * strides[i] + wei_spatial_id[i] * dilations[i] - pads[i]; - } - bool out_of_bound = false; - for(std::size_t i = 0; i < ConvDim; ++i) - { - out_of_bound = out_of_bound or - (in_spatial_id[i] < 0 or in_spatial_id[i] >= in_spatial_len[i]); - } - if(!out_of_bound) - { - if(vector_len > 1) - { - std::array in_id{}; - in_id[1] = out_n_id; - in_id[2] = in_c_id; - std::copy_n(in_spatial_id.begin(), ConvDim, in_id.begin() + 3); - for(std::size_t i = 0; i < vector_len; i++) - { - in_id[0] = i; - acc += Tacc(in(in_id)) * - Tacc(wei(i, out_k_id, wei_c_id, wei_spatial_id_pack...)); - } - } - else - { - std::array in_id{}; - in_id[0] = out_n_id; - in_id[1] = in_c_id; - std::copy_n(in_spatial_id.begin(), ConvDim, in_id.begin() + 2); - Tacc tmp1 = static_cast(fi(in(in_id))); - Tacc tmp2 = - static_cast(fw(wei(out_k_id, wei_c_id, wei_spatial_id_pack...))); - acc += tmp1 * tmp2; - } - } - }); - }); - if(vector_len > 1) - { - out(out_k_id % vector_len, out_n_id, out_k_id / vector_len, out_spatial_id_pack...) = - static_cast(acc); - } - else - { - out(out_n_id, out_k_id, out_spatial_id_pack...) 
= static_cast(acc); - } - }); -} - -template -void cpu_convolution_backward_data_impl(tensor& in, - const tensor& wei, - const tensor& out, - const Range& pads, - const Range& strides, - const Range& dilations, - std::size_t group_count, - FW fw = {}, - FO fo = {}) -{ - static_assert(ConvDim > 0, "wrong! convolution dim should be larger than 0"); - assert(in.desc.GetNumDims() == ConvDim + 2 and wei.desc.GetNumDims() == ConvDim + 2 and - out.desc.GetNumDims() == ConvDim + 2 and pads.size() == ConvDim and - strides.size() == ConvDim and dilations.size() == ConvDim); - - std::size_t in_n_len = in.desc.GetLengths()[0]; - std::size_t in_c_len = in.desc.GetLengths()[1]; - - std::size_t wei_k_len = wei.desc.GetLengths()[0]; - std::size_t wei_c_len = wei.desc.GetLengths()[1]; - - std::size_t wei_k_len_per_group = wei_k_len / group_count; - - std::array in_spatial_len{}; - std::array wei_spatial_len{}; - std::array out_spatial_len{}; - - std::copy_n(in.desc.GetLengths().begin() + 2, ConvDim, in_spatial_len.begin()); - std::copy_n(wei.desc.GetLengths().begin() + 2, ConvDim, wei_spatial_len.begin()); - std::copy_n(out.desc.GetLengths().begin() + 2, ConvDim, out_spatial_len.begin()); - - auto par_ford_in_nc_spatial = - miopen::unpacker(miopen::prepender(miopen::par_ford, in_n_len, in_c_len))(in_spatial_len); - - par_ford_in_nc_spatial( - [&](std::size_t in_n_id, std::size_t in_c_id, auto... in_spatial_id_pack) { - auto in_spatial_id = make_array(in_spatial_id_pack...); - - std::size_t group_id = in_c_id / wei_c_len; - - Tacc acc = 0; - - miopen::ford(wei_k_len_per_group)([&](std::size_t wei_k_id_inside_group) { - auto ford_wei_spatial = miopen::unpacker(miopen::ford)(wei_spatial_len); - - ford_wei_spatial([&](auto... 
wei_spatial_id_pack) { - auto wei_spatial_id = make_array(wei_spatial_id_pack...); - - std::array out_spatial_id_{}; - std::array out_spatial_id{}; - - for(std::size_t i = 0; i < ConvDim; ++i) - { - out_spatial_id_[i] = - pads[i] + in_spatial_id[i] - wei_spatial_id[i] * dilations[i]; - out_spatial_id[i] = out_spatial_id_[i] / strides[i]; - } - - bool use = true; - for(std::size_t i = 0; i < ConvDim; ++i) - { - use &= out_spatial_id_[i] % strides[i] == 0 and out_spatial_id[i] >= 0 and - out_spatial_id[i] < out_spatial_len[i]; - } - - if(use) - { - std::size_t out_k_id = - group_id * wei_k_len_per_group + wei_k_id_inside_group; - std::size_t wei_c_id = in_c_id % wei_c_len; - - std::array out_id{}; - out_id[0] = in_n_id; - out_id[1] = out_k_id; - std::copy_n(out_spatial_id.begin(), ConvDim, out_id.begin() + 2); - Tacc tmp1 = fo(out(out_id)); - Tacc tmp2 = fw(wei(out_k_id, wei_c_id, wei_spatial_id_pack...)); - acc += tmp1 * tmp2; - } - }); - }); - // TODO: Why do we need a no-lint here ? - in(in_n_id, in_c_id, in_spatial_id_pack...) = static_cast(acc); // NOLINT - }); -} - -template -void cpu_convolution_backward_weight_impl(const tensor& in, - tensor& wei, - const tensor& out, - const Range& pads, - const Range& strides, - const Range& dilations, - std::size_t group_count, - FI fi, - FO fo) -{ - static_assert(ConvDim > 0, "wrong! 
convolution dim should be larger than 0"); - assert(in.desc.GetNumDims() == ConvDim + 2 and wei.desc.GetNumDims() == ConvDim + 2 and - out.desc.GetNumDims() == ConvDim + 2 and pads.size() == ConvDim and - strides.size() == ConvDim and dilations.size() == ConvDim); - - std::size_t out_n_len = out.desc.GetLengths()[0]; - - std::size_t wei_k_len = wei.desc.GetLengths()[0]; - std::size_t wei_c_len = wei.desc.GetLengths()[1]; - - std::size_t wei_k_len_per_group = wei_k_len / group_count; - - std::array in_spatial_len{}; - std::array wei_spatial_len{}; - std::array out_spatial_len{}; - - std::copy_n(in.desc.GetLengths().begin() + 2, ConvDim, in_spatial_len.begin()); - std::copy_n(wei.desc.GetLengths().begin() + 2, ConvDim, wei_spatial_len.begin()); - std::copy_n(out.desc.GetLengths().begin() + 2, ConvDim, out_spatial_len.begin()); - - auto par_ford_wei_kc_spatial = miopen::unpacker( - miopen::prepender(miopen::par_ford, wei_k_len, wei_c_len))(wei_spatial_len); - - par_ford_wei_kc_spatial( - [&](std::size_t wei_k_id, std::size_t wei_c_id, auto... wei_spatial_id_pack) { - auto wei_spatial_id = make_array(wei_spatial_id_pack...); - - std::size_t group_id = wei_k_id / wei_k_len_per_group; - std::size_t in_c_id = group_id * wei_c_len + wei_c_id; - - Tacc acc = 0; - - miopen::ford(out_n_len)([&](std::size_t out_n_id) { - auto ford_out_spatial = miopen::unpacker(miopen::ford)(out_spatial_len); - - ford_out_spatial([&](auto... 
out_spatial_id_pack) { - auto out_spatial_id = make_array(out_spatial_id_pack...); - - std::array in_spatial_id{}; - - for(std::size_t i = 0; i < ConvDim; ++i) - { - in_spatial_id[i] = out_spatial_id[i] * strides[i] + - wei_spatial_id[i] * dilations[i] - pads[i]; - } - - bool out_of_bound = false; - for(std::size_t i = 0; i < ConvDim; ++i) - { - out_of_bound = out_of_bound or (in_spatial_id[i] < 0 or - in_spatial_id[i] >= in_spatial_len[i]); - } - - if(!out_of_bound) - { - std::array in_id{}; - in_id[0] = out_n_id; - in_id[1] = in_c_id; - std::copy_n(in_spatial_id.begin(), ConvDim, in_id.begin() + 2); - Tacc tmp1 = fi(in(in_id)); - Tacc tmp2 = fo(out(out_n_id, wei_k_id, out_spatial_id_pack...)); - acc += tmp1 * tmp2; - } - }); - - wei(wei_k_id, wei_c_id, wei_spatial_id_pack...) = static_cast(acc); - }); - }); -} - -template , - typename FW = PassThru> -void cpu_convolution_forward(std::size_t spatial_dim, - const tensor& in, - const tensor& wei, - tensor& out, - const Range& pads, - const Range& strides, - const Range& dilations, - std::size_t group_count, - FI fi = {}, - FW fw = {}) -{ - switch(spatial_dim) - { - case 1: { - cpu_convolution_forward_impl<1, Tacc>( - in, wei, out, pads, strides, dilations, group_count, fi, fw); - break; - } - case 2: { - cpu_convolution_forward_impl<2, Tacc>( - in, wei, out, pads, strides, dilations, group_count, fi, fw); - break; - } - case 3: { - cpu_convolution_forward_impl<3, Tacc>( - in, wei, out, pads, strides, dilations, group_count, fi, fw); - break; - } - case 4: { - cpu_convolution_forward_impl<4, Tacc>( - in, wei, out, pads, strides, dilations, group_count, fi, fw); - break; - } - default: { - MIOPEN_THROW("not belong to any case"); - } - } -} - -template , - typename FO = PassThru> -void cpu_convolution_backward_data(std::size_t spatial_dim, - tensor& in, - const tensor& wei, - const tensor& out, - const Range& pads, - const Range& strides, - const Range& dilations, - std::size_t group_count, - FW fw = {}, - FO fo = {}) 
-{ - switch(spatial_dim) - { - case 1: { - cpu_convolution_backward_data_impl<1, Tacc>( - in, wei, out, pads, strides, dilations, group_count, fw, fo); - break; - } - case 2: { - cpu_convolution_backward_data_impl<2, Tacc>( - in, wei, out, pads, strides, dilations, group_count, fw, fo); - break; - } - case 3: { - cpu_convolution_backward_data_impl<3, Tacc>( - in, wei, out, pads, strides, dilations, group_count, fw, fo); - break; - } - case 4: { - cpu_convolution_backward_data_impl<4, Tacc>( - in, wei, out, pads, strides, dilations, group_count, fw, fo); - break; - } - default: { - MIOPEN_THROW("not belong to any case"); - } - } -} - -template , - typename FO = PassThru> -void cpu_convolution_backward_weight(std::size_t spatial_dim, - const tensor& in, - tensor& wei, - const tensor& out, - const Range& pads, - const Range& strides, - const Range& dilations, - std::size_t group_count, - FI fi = {}, - FO fo = {}) -{ - switch(spatial_dim) - { - case 1: { - cpu_convolution_backward_weight_impl<1, Tacc>( - in, wei, out, pads, strides, dilations, group_count, fi, fo); - break; - } - case 2: { - cpu_convolution_backward_weight_impl<2, Tacc>( - in, wei, out, pads, strides, dilations, group_count, fi, fo); - break; - } - case 3: { - cpu_convolution_backward_weight_impl<3, Tacc>( - in, wei, out, pads, strides, dilations, group_count, fi, fo); - break; - } - case 4: { - cpu_convolution_backward_weight_impl<4, Tacc>( - in, wei, out, pads, strides, dilations, group_count, fi, fo); - break; - } - default: { - MIOPEN_THROW("not belong to any case"); - } - } -} +#include #endif diff --git a/projects/miopen/test/cpu_layernorm.hpp b/projects/miopen/test/cpu_layernorm.hpp index 8b5bf965deab..9f1c7a55ba42 100644 --- a/projects/miopen/test/cpu_layernorm.hpp +++ b/projects/miopen/test/cpu_layernorm.hpp @@ -1,216 +1,5 @@ -// Copyright © Advanced Micro Devices, Inc., or its affiliates. -// SPDX-License-Identifier: MIT +// Forwarding header — implementation moved to miopen_utils. 
#ifndef GUARD_CPU_CONV_HPP #define GUARD_CPU_CONV_HPP - -#include <../test/tensor_holder.hpp> - -template -void cpu_layernorm_forward(tensor input, - tensor weight, - tensor bias, - tensor& ref_output, - tensor& ref_mean, - tensor& ref_rstd, - float eps, - int32_t dim, - miopenNormMode_t mode, - bool use_multithread = false) -{ - auto layout = input.desc.GetLayoutEnum(); - size_t stride = 1; - if(dim > 1 && layout.has_value() && - (layout.value() == miopenTensorNHWC || layout.value() == miopenTensorNDHWC)) - { - stride = input.desc.GetLengths()[1]; // stride = C - } - - auto dims = input.desc.GetLengths(); - size_t outer_size = 1; - size_t inner_size = 1; - for(size_t i = 0; i < dims.size(); ++i) - { - if(i < dim) - { - if(!(stride > 1 && i == 1)) - { - outer_size *= dims[i]; - } - } - else - { - inner_size *= dims[i]; - } - } - - size_t min_grain = use_multithread ? 8 : outer_size; - miopen::par_for(outer_size, min_grain, [&](int32_t o) { - miopen::ford(stride)([&](int32_t s) { - double mean_v = 0.0; - double var_v = 0.0; - - miopen::ford(inner_size)([&](int32_t i) { - double tmp = static_cast(input[o * inner_size * stride + i * stride + s]); - mean_v += tmp; - var_v += tmp * tmp; - }); - - mean_v = mean_v / inner_size; - var_v = var_v / inner_size - mean_v * mean_v; - double rstd_v = 1.0 / sqrt(var_v + eps); - - ref_mean[o * stride + s] = static_cast(mean_v); - ref_rstd[o * stride + s] = static_cast(rstd_v); - - miopen::ford(inner_size)([&](int32_t i) { - double weight_v = - (mode == MIOPEN_ELEMENTWISE_AFFINE) ? 1.0 : static_cast(weight[i]); - double bias_v = - (mode == MIOPEN_ELEMENTWISE_AFFINE) ? 
0.0 : static_cast(bias[i]); - - ref_output[o * inner_size * stride + i * stride + s] = static_cast( - (static_cast(input[o * inner_size * stride + i * stride + s]) - - mean_v) * - rstd_v * weight_v + - bias_v); - }); - }); - }); -} - -template -void cpu_layernorm_backward(tensor dy, - tensor x, - tensor weight, - tensor mean, - tensor rstd, - tensor& ref_dx, - int32_t dim, - miopenNormMode_t mode, - bool use_multithread = false) -{ - auto layout = dy.desc.GetLayoutEnum(); - size_t stride = 1; - if(dim > 1 && (layout == miopenTensorNHWC || layout == miopenTensorNDHWC)) - { - stride = dy.desc.GetLengths()[1]; // stride = C - } - - auto dims = dy.desc.GetLengths(); - size_t outer_size = 1; - size_t inner_size = 1; - for(size_t i = 0; i < dims.size(); ++i) - { - if(i < dim) - { - if(!(stride > 1 && i == 1)) - { - outer_size *= dims[i]; - } - } - else - { - inner_size *= dims[i]; - } - } - - size_t min_grain = use_multithread ? 8 : outer_size; - miopen::par_for(outer_size, min_grain, [&](int32_t o) { - miopen::ford(stride)([&](int32_t s) { - double sum_dy_weight = 0.0; - double sum_dy_weight_x = 0.0; - - miopen::ford(inner_size)([&](int32_t i) { - double pweight = - (mode == MIOPEN_ELEMENTWISE_AFFINE) ? 1.0 : static_cast(weight[i]); - double pdy = (dy.GetSize() != 0) - ? static_cast(dy[o * inner_size * stride + i * stride + s]) - : 0.0; - double px = static_cast(x[o * inner_size * stride + i * stride + s]); - - sum_dy_weight += pdy * pweight; - sum_dy_weight_x += pdy * px * pweight; - }); - - double scale = 1.0 / static_cast(inner_size); - double prstd = static_cast(rstd[o * stride + s]); - double pmean = static_cast(mean[o * stride + s]); - double a = prstd * prstd * prstd * scale * (sum_dy_weight_x - sum_dy_weight * pmean); - double b = prstd * sum_dy_weight * scale - a * pmean; - - miopen::ford(inner_size)([&](int32_t i) { - double pweight = - (mode == MIOPEN_ELEMENTWISE_AFFINE) ? 1.0 : static_cast(weight[i]); - double pdy = (dy.GetSize() != 0) - ? 
static_cast(dy[o * inner_size * stride + i * stride + s]) - : 0.0; - double val = prstd * pdy * pweight - - a * static_cast(x[o * inner_size * stride + i * stride + s]) - - b; - - ref_dx[o * inner_size * stride + i * stride + s] = static_cast(val); - }); - }); - }); -} - -template -void cpu_layernorm_backward_weight_bias(tensor dy, - tensor x, - tensor mean, - tensor rstd, - tensor& ref_dw, - tensor& ref_db, - int32_t dim, - bool use_multithread = false) -{ - auto layout = dy.desc.GetLayoutEnum(); - size_t stride = 1; - if(dim > 1 && (layout == miopenTensorNHWC || layout == miopenTensorNDHWC)) - { - stride = dy.desc.GetLengths()[1]; // stride = C - } - - auto dims = dy.desc.GetLengths(); - size_t outer_size = 1; - size_t inner_size = 1; - for(size_t i = 0; i < dims.size(); ++i) - { - if(i < dim) - { - if(!(stride > 1 && i == 1)) - { - outer_size *= dims[i]; - } - } - else - { - inner_size *= dims[i]; - } - } - - size_t min_grain = use_multithread ? 8 : inner_size; - miopen::par_for(inner_size, min_grain, [&](int32_t i) { - double sum_dw = 0.0; - double sum_db = 0.0; - - miopen::ford(stride)([&](int32_t s) { - miopen::ford(outer_size)([&](int32_t o) { - double prstd = static_cast(rstd[o * stride + s]); - double pmean = static_cast(mean[o * stride + s]); - double pdy = (dy.GetSize() != 0) - ? 
static_cast(dy[o * inner_size * stride + i * stride + s]) - : 0; - double px = static_cast(x[o * inner_size * stride + i * stride + s]); - - sum_dw += pdy * (px - pmean) * prstd; - sum_db += pdy; - }); - }); - - ref_dw[i] = sum_dw; - ref_db[i] = sum_db; - }); -} - +#include #endif diff --git a/projects/miopen/test/cpu_reduce_util.hpp b/projects/miopen/test/cpu_reduce_util.hpp index 88728b02faec..73de3b18e2e1 100644 --- a/projects/miopen/test/cpu_reduce_util.hpp +++ b/projects/miopen/test/cpu_reduce_util.hpp @@ -1,649 +1,5 @@ -/******************************************************************************* - * - * MIT License - * - * Copyright (c) 2020 Advanced Micro Devices, Inc. - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - * - *******************************************************************************/ +// Forwarding header — implementation moved to miopen_utils. 
#ifndef GUARD_CPU_REDUCE_UTIL_HPP #define GUARD_CPU_REDUCE_UTIL_HPP - -#include "miopen/reducetensor.hpp" -#include "tensor_holder.hpp" -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -namespace reduce { - -template -static inline bool float_equal_one(T); - -static inline bool float_equal_one(float x) { return x == 1.0f; }; - -static inline bool float_equal_one(double x) { return x == 1.0; }; - -static inline bool float_equal_one(half_float::half x) -{ - return x == convert_type(1.0f); -}; - -template -static inline bool float_equal_zero(T x); - -static inline bool float_equal_zero(float x) { return x == 0.0f; }; - -static inline bool float_equal_zero(double x) { return x == 0.0; }; - -static inline bool float_equal_zero(half_float::half x) -{ - return x == convert_type(0.0f); -}; - -template -static inline void build_radix(const std::vector& lens, std::vector& radix) -{ - const std::size_t D = lens.size(); - radix.assign(D, 1); - for(std::size_t d = D; d-- > 1;) - radix[d - 1] = radix[d] * static_cast(lens[d]); // radix[d] = Π_{k>d} lens[k] -} - -// i -> memory offset using lens-radix + actual strides -template -static inline std::size_t linear_to_offset_by_lens_strides(std::size_t i, - const std::vector& lens, - const std::vector& radix, - const std::vector& strides) -{ - std::size_t off = 0; - for(std::size_t d = 0; d < lens.size(); ++d) - { - const std::size_t idx_d = (i / radix[d]) % static_cast(lens[d]); - off += idx_d * static_cast(strides[d]); - } - return off; -} - -template -static inline std::function PreUnaryOpFn(miopenReduceTensorOp_t op_, std::size_t) -{ - using std::abs; - - switch(op_) - { - case MIOPEN_REDUCE_TENSOR_NORM1: return ([&](compType& a_) { a_ = abs(a_); }); - case MIOPEN_REDUCE_TENSOR_NORM2: return ([&](compType& a_) { a_ = a_ * a_; }); - case MIOPEN_REDUCE_TENSOR_AMAX: return ([&](compType& a_) { a_ = abs(a_); }); - - case MIOPEN_REDUCE_TENSOR_AVG: - case MIOPEN_REDUCE_TENSOR_ADD: - 
case MIOPEN_REDUCE_TENSOR_MUL: - case MIOPEN_REDUCE_TENSOR_MIN: - case MIOPEN_REDUCE_TENSOR_MAX: return ([&](compType&) {}); - } - - throw std::runtime_error(std::string(__FUNCTION__) + - ": using undefined Reduction operation is not permitted"); -}; - -template -static inline std::function PosUnaryOpFn(miopenReduceTensorOp_t op_, - std::size_t divider) -{ - using std::sqrt; - - switch(op_) - { - case MIOPEN_REDUCE_TENSOR_NORM2: return ([&](compType& a_) { a_ = sqrt(a_); }); - - case MIOPEN_REDUCE_TENSOR_AVG: - return ([&, divider](compType& a_) { - a_ = a_ / convert_type(static_cast(divider)); - }); - - case MIOPEN_REDUCE_TENSOR_ADD: - case MIOPEN_REDUCE_TENSOR_NORM1: - case MIOPEN_REDUCE_TENSOR_MUL: - case MIOPEN_REDUCE_TENSOR_MIN: - case MIOPEN_REDUCE_TENSOR_MAX: - case MIOPEN_REDUCE_TENSOR_AMAX: return ([&](compType&) {}); - } - - throw std::runtime_error(std::string(__FUNCTION__) + - ": using undefined Reduction operation is not permitted"); -}; - -template -static inline std::function ReduceOpFn(miopenReduceTensorOp_t op_) -{ - switch(op_) - { - case MIOPEN_REDUCE_TENSOR_ADD: - case MIOPEN_REDUCE_TENSOR_AVG: - case MIOPEN_REDUCE_TENSOR_NORM1: - case MIOPEN_REDUCE_TENSOR_NORM2: return ([&](compType& a_, compType b_) { a_ = a_ + b_; }); - - case MIOPEN_REDUCE_TENSOR_MUL: return ([&](compType& a_, compType b_) { a_ = a_ * b_; }); - - case MIOPEN_REDUCE_TENSOR_MIN: - return ([&](compType& a_, compType b_) { - if(a_ > b_) - a_ = b_; - }); - - case MIOPEN_REDUCE_TENSOR_MAX: - case MIOPEN_REDUCE_TENSOR_AMAX: - return ([&](compType& a_, compType b_) { - if(a_ < b_) - a_ = b_; - }); - } - - throw std::runtime_error(std::string(__FUNCTION__) + - ": using undefined Reduction operation is not permitted"); -}; - -template -static inline std::function -ReduceOpFn2(miopenReduceTensorOp_t op_) -{ - switch(op_) - { - case MIOPEN_REDUCE_TENSOR_MIN: - return ([&](compType& a_, compType b_, bool& changed) { - if(a_ > b_) - { - a_ = b_; - changed = true; - } - else - { - changed 
= false; - } - }); - - case MIOPEN_REDUCE_TENSOR_MAX: - case MIOPEN_REDUCE_TENSOR_AMAX: - return ([&](compType& a_, compType b_, bool& changed) { - if(a_ < b_) - { - a_ = b_; - changed = true; - } - else - { - changed = false; - } - }); - - case MIOPEN_REDUCE_TENSOR_ADD: - case MIOPEN_REDUCE_TENSOR_MUL: - case MIOPEN_REDUCE_TENSOR_AVG: - case MIOPEN_REDUCE_TENSOR_NORM1: - case MIOPEN_REDUCE_TENSOR_NORM2: return (std::function{}); - }; - - throw std::runtime_error(std::string(__FUNCTION__) + - ": using undefined Reduction operation is not permitted"); -}; - -template -static inline compType ReduceOpZeroVal(miopenReduceTensorOp_t op_) -{ - switch(op_) - { - case MIOPEN_REDUCE_TENSOR_ADD: - case MIOPEN_REDUCE_TENSOR_AVG: - case MIOPEN_REDUCE_TENSOR_NORM1: - case MIOPEN_REDUCE_TENSOR_NORM2: return (convert_type(0.0f)); - - case MIOPEN_REDUCE_TENSOR_MUL: return (convert_type(1.0f)); - - case MIOPEN_REDUCE_TENSOR_MIN: return (std::numeric_limits::max()); - - case MIOPEN_REDUCE_TENSOR_MAX: return (std::numeric_limits::lowest()); - case MIOPEN_REDUCE_TENSOR_AMAX: return (convert_type(0.0f)); - } - - throw std::runtime_error(std::string(__FUNCTION__) + - ": using undefined Reduction operation is not permitted"); -}; - -template -static inline void binop_with_nan_check(miopenNanPropagation_t nanOpt, - reduceOpT&& opReduce, - compType& accuVal, - compType currVal) -{ - using std::isnan; - - if(nanOpt == MIOPEN_NOT_PROPAGATE_NAN) - { - opReduce(accuVal, currVal); - } - else - { - if(isnan(currVal)) - accuVal = currVal; - else - opReduce(accuVal, currVal); - }; -}; - -template -static inline void binop_with_nan_check2(miopenNanPropagation_t nanOpt, - reduceOpT&& opReduce, - compType& accuVal, - compType currVal, - int& accuIndex, - int currIndex) -{ - using std::isnan; - - if(nanOpt == MIOPEN_NOT_PROPAGATE_NAN) - { - bool changed; - - opReduce(accuVal, currVal, changed); - - if(changed) - accuIndex = currIndex; - } - else - { - if(isnan(currVal)) - { - accuVal = currVal; - 
accuIndex = currIndex; - } - else - { - bool changed; - - opReduce(accuVal, currVal, changed); - - if(changed) - accuIndex = currIndex; - }; - }; -}; - -}; // end of namespace reduce - -template -std::vector> get_all_indexes(const std::vector& lens) -{ - const std::size_t D = lens.size(); - assert(D > 0); - - std::size_t N = 1; - for(const auto L : lens) - N *= static_cast(L); - - std::vector> out; - out.resize(N); - for(auto& row : out) - row.resize(D); - - std::vector stride(D, 1); - for(std::size_t d = D; d-- > 1;) - stride[d - 1] = stride[d] * static_cast(lens[d]); - - for(std::size_t r = 0; r < N; ++r) - { - for(std::size_t d = 0; d < D; ++d) - out[r][d] = static_cast((r / stride[d]) % static_cast(lens[d])); - } - - return out; -} - -template -static inline T -linear_to_offset(size_t li, const std::vector& lens, const std::vector& strides) -{ - T off = 0; - for(int d = int(lens.size()) - 1; d >= 0; --d) - { - const T idx = li % lens[d]; - li /= lens[d]; - off += idx * strides[d]; - } - return off; -} - -template -T get_offset_from_index(const std::vector& strides, const std::vector& index) -{ - T offset = 0; - - assert(strides.size() == index.size()); - - for(int i = 0; i < index.size(); i++) - offset += strides[i] * index[i]; - - return (offset); -}; - -template -T get_flatten_offset(const std::vector& lengths, const std::vector& index) -{ - T offset = 0; - - assert(lengths.size() == index.size() && !lengths.empty()); - - int len = lengths.size(); - T stride = 1; - - // for len==1, the loop is not executed - for(int i = len - 1; i > 0; i--) - { - offset += stride * index[i]; - - stride *= lengths[i]; - }; - - offset += stride * index[0]; - - return (offset); -}; - -template -struct Reducer -{ - compType acc; - bool withIdx; - int idx; // meaningful only when WithIdx==true - miopenNanPropagation_t nanOpt; - // functors for reduction - decltype(reduce::ReduceOpFn(MIOPEN_REDUCE_TENSOR_ADD)) opNoIdx; - decltype(reduce::ReduceOpFn2(MIOPEN_REDUCE_TENSOR_ADD)) 
opWithIdx; - - Reducer(miopenNanPropagation_t n, miopenReduceTensorOp_t rop, compType zero, bool useIdx) - : acc(zero), - withIdx(useIdx), - idx(0), - nanOpt(n), - opNoIdx(reduce::ReduceOpFn(rop)), - opWithIdx(reduce::ReduceOpFn2(rop)) - { - } - - inline void step(compType v, int flat_i) - { - if(withIdx) - reduce::binop_with_nan_check2(nanOpt, opWithIdx, acc, v, idx, flat_i); - else - reduce::binop_with_nan_check(nanOpt, opNoIdx, acc, v); - } - - inline void combine(const Reducer& other) - { - if(withIdx) - reduce::binop_with_nan_check2(nanOpt, opWithIdx, acc, other.acc, idx, other.idx); - else - reduce::binop_with_nan_check(nanOpt, opNoIdx, acc, other.acc); - } -}; - -template -std::tuple, tensor> reduce_cpu_common(const miopenReduceTensorOp_t& reduceOp, - const miopenNanPropagation_t& nanOpt, - const std::vector& inLengths, - const std::vector& outLengths, - const std::vector& input, - const std::vector& inStrides, - const std::vector& output, - const std::vector& outStrides, - float alpha, - float beta, - bool parallel, - bool withIdx) -{ - using reduce::convert_type; - using reduce::ReduceOpZeroVal; - - // Partition dims - std::vector invariantDims, toReduceDims; - std::vector invLens, redLens, invStrides_v, redStrides_v; - - for(int i = 0; i < static_cast(inLengths.size()); ++i) - { - if(inLengths[i] == outLengths[i]) - { - invariantDims.push_back(i); - invLens.push_back(inLengths[i]); - invStrides_v.push_back(inStrides[i]); - } - else - { - toReduceDims.push_back(i); - redLens.push_back(inLengths[i]); - redStrides_v.push_back(inStrides[i]); - } - } - - const bool reduceAllDims = invariantDims.empty(); - - // unary ops & zero vals - const compType zeroV = ReduceOpZeroVal(reduceOp); - - // divider = Π reduced dims (or N if reduce-all) - std::size_t divider = 1; - if(reduceAllDims) - divider = std::accumulate( - inLengths.begin(), inLengths.end(), std::size_t{1}, std::multiplies<>()); - else - divider = - std::accumulate(redLens.begin(), redLens.end(), 
std::size_t{1}, std::multiplies<>()); - - auto PreUnaryOp = reduce::PreUnaryOpFn(reduceOp, divider); - auto PosUnaryOp = reduce::PosUnaryOpFn(reduceOp, divider); - - // outputs - auto res = tensor{outLengths}; - res.data = output; - auto res_indices = tensor{outLengths}; - if(withIdx) - std::fill(res_indices.begin(), res_indices.end(), 0); - - if(reduceAllDims) - { - // Flatten whole tensor - const std::size_t N = divider; // product of all dims - std::vector lens_radix; - reduce::build_radix(inLengths, lens_radix); - - // parallel chunking - std::size_t hw = - std::max(std::size_t{1}, static_cast(std::thread::hardware_concurrency())); - const std::size_t P = std::min(N, hw * 4ul); - const std::size_t chunk = (N + P - 1) / P; - - std::vector> partial; - partial.reserve(P); - for(std::size_t p = 0; p < P; ++p) - partial.emplace_back(nanOpt, reduceOp, zeroV, withIdx); - - auto worker = [&](int p) { - const std::size_t begin = std::size_t(p) * chunk; - const std::size_t end = std::min(begin + chunk, N); - - auto& r = partial[p]; - for(std::size_t i = begin; i < end; ++i) - { - const auto off = - reduce::linear_to_offset_by_lens_strides(i, inLengths, lens_radix, inStrides); - auto v = convert_type(input[off]); - PreUnaryOp(v); - r.step(v, static_cast(i)); // flat index across whole tensor - } - }; - - if(parallel) - { - miopen::par_for(static_cast(P), worker); - } - else - { - for(int p = 0; p < P; ++p) - { - worker(p); - } - } - - // combine - Reducer R(nanOpt, reduceOp, zeroV, withIdx); - for(std::size_t p = 0; p < P; ++p) - R.combine(partial[p]); - - // post - PosUnaryOp(R.acc); - if(alpha != 1.0f) - R.acc *= convert_type(alpha); - if(beta != 0.0f) - R.acc += convert_type(output[0]) * convert_type(beta); - - res.data[0] = convert_type(R.acc); - if(withIdx) - res_indices.data[0] = R.idx; - } - else - { - // Build radices for invariant and reduced subspaces - std::vector invRad, redRad; - reduce::build_radix(invLens, invRad); - reduce::build_radix(redLens, redRad); - 
- const std::size_t INV = - std::accumulate(invLens.begin(), invLens.end(), std::size_t{1}, std::multiplies<>()); - const std::size_t TR = divider; - - std::size_t hw = - std::max(std::size_t{1}, static_cast(std::thread::hardware_concurrency())); - const std::size_t Te = std::min(hw * 4ul, std::max(1, INV)); - const std::size_t chunk = (INV + Te - 1) / Te; - - auto worker = [&](int t) { - const std::size_t row0 = std::size_t(t) * chunk; - const std::size_t row1 = std::min(row0 + chunk, INV); - - for(std::size_t r = row0; r < row1; ++r) - { - // decode invariant multi-index; compute base offsets - std::size_t tmp = r; - std::size_t base_in_off = 0; - std::size_t base_out_off = 0; - for(std::size_t k = 0; k < invLens.size(); ++k) - { - const std::size_t idx = (tmp / invRad[k]) % invLens[k]; - base_in_off += idx * invStrides_v[k]; - base_out_off += idx * outStrides[invariantDims[k]]; - } - - Reducer R(nanOpt, reduceOp, zeroV, withIdx); - - // iterate reduced subspace - for(std::size_t i = 0; i < TR; ++i) - { - std::size_t tmp2 = i; - std::size_t red_off = 0; - for(std::size_t k = 0; k < redLens.size(); ++k) - { - const std::size_t idx = (tmp2 / redRad[k]) % redLens[k]; - red_off += idx * redStrides_v[k]; - } - - auto v = convert_type(input[base_in_off + red_off]); - PreUnaryOp(v); - R.step(v, static_cast(i)); // flat index inside reduced subspace - } - - PosUnaryOp(R.acc); - if(alpha != 1.0f) - R.acc *= convert_type(alpha); - if(beta != 0.0f) - R.acc += - convert_type(output[base_out_off]) * convert_type(beta); - - res.data[base_out_off] = convert_type(R.acc); - if(withIdx) - res_indices.data[base_out_off] = R.idx; - } - }; - - if(parallel) - { - miopen::par_for(static_cast(Te), worker); - } - else - { - for(int te = 0; te < Te; ++te) - { - worker(te); - } - } - } - - return {res, res_indices}; -} - -template -std::tuple, tensor> -reduce_cpu_common(const miopen::ReduceTensorDescriptor& reduceDesc, - const tensor& input, - const tensor& output, - float alpha, - float 
beta, - bool parallel, - bool withIdx) -{ - auto inLengths = input.desc.GetLengths(); - auto outLengths = output.desc.GetLengths(); - auto inStrides = input.desc.GetStrides(); - auto outStrides = output.desc.GetStrides(); - - const auto reduceOp = reduceDesc.reduceTensorOp_; - const auto nanOpt = reduceDesc.reduceTensorNanOpt_; - - return reduce_cpu_common(reduceOp, - nanOpt, - inLengths, - outLengths, - input.data, - inStrides, - output.data, - outStrides, - alpha, - beta, - parallel, - withIdx); -} - +#include #endif diff --git a/projects/miopen/test/fusionHost.hpp b/projects/miopen/test/fusionHost.hpp index 9693295959d7..a13ee5601cd4 100644 --- a/projects/miopen/test/fusionHost.hpp +++ b/projects/miopen/test/fusionHost.hpp @@ -1,994 +1,3 @@ -/******************************************************************************* - * - * MIT License - * - * Copyright (c) 2018 Advanced Micro Devices, Inc. - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - * - *******************************************************************************/ +// Forwarding header — implementation moved to miopen_utils. #pragma once -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include "get_handle.hpp" -#include "tensor_holder.hpp" -#include "verify.hpp" - -template -void convHostForward(const tensor& input, - tensor& output, - const tensor& weights, - const int bias_mode, - const tensor& bias, - const miopenConvolutionDescriptor_t convDesc) -{ - - int in_n, in_c, in_h, in_w; - int in_nstride, in_cstride, in_hstride, in_wstride; - std::tie(in_n, in_c, in_h, in_w) = miopen::tien<4>(input.desc.GetLengths()); - std::tie(in_nstride, in_cstride, in_hstride, in_wstride) = - miopen::tien<4>(input.desc.GetStrides()); - - int wei_n, wei_c, wei_h, wei_w; - int wei_nstride, wei_cstride, wei_hstride, wei_wstride; - std::tie(wei_n, wei_c, wei_h, wei_w) = miopen::tien<4>(weights.desc.GetLengths()); - std::tie(wei_nstride, wei_cstride, wei_hstride, wei_wstride) = - miopen::tien<4>(weights.desc.GetStrides()); - - int out_n, out_c, out_h, out_w; - int out_nstride, out_cstride, out_hstride, out_wstride; - std::tie(out_n, out_c, out_h, out_w) = miopen::tien<4>(output.desc.GetLengths()); - std::tie(out_nstride, out_cstride, out_hstride, out_wstride) = - miopen::tien<4>(output.desc.GetStrides()); - - int stride_h, stride_w, pad_h, pad_w, dilation_h, dilation_w; - miopenConvolutionMode_t mode; - miopenPaddingMode_t pmode = miopen::deref(convDesc).paddingMode; - miopenGetConvolutionDescriptor( - convDesc, &mode, &pad_h, &pad_w, &stride_h, &stride_w, &dilation_h, &dilation_w); - - if(pmode == miopenPaddingSame) - 
{ - pad_h = (in_h % stride_h == 0) ? (std::max((wei_h - stride_h), 0)) - : (std::max((wei_h - (in_h % stride_h)), 0)); - pad_w = (in_w % stride_w == 0) ? (std::max((wei_w - stride_w), 0)) - : (std::max((wei_w - (in_w % stride_w)), 0)); - pad_h /= 2; - pad_w /= 2; - } - else if(pmode == miopenPaddingValid) - { - pad_h = 0; - pad_w = 0; - } - - if(out_h <= 0 || out_w <= 0) - MIOPEN_THROW("Invalid Test Case: Check Output Dimension."); - - for(int o = 0; o < out_n; o++) - { // mini-batch size - for(int w = 0; w < out_c; w++) - { // out_channels (num filters) - for(int i = 0; i < out_h; i++) - { // output_height (from getforwardoutputdim()) - int in_off_h = i * stride_h; - for(int j = 0; j < out_w; j++) - { // output_width (from getforwardoutputdim()) - /*auto acc = static_cast(0.);*/ - auto acc = static_cast(0.); - int in_off_w = j * stride_w; - for(int k = 0; k < in_c; k++) - { // in_channels (RGB) - for(int x = 0; x < wei_h; x++) - { - int in_x = in_off_h - pad_h + x * dilation_h; - if(in_x >= 0 && in_x < in_h) - { - for(int y = 0; y < wei_w; y++) - { - int in_y = in_off_w - pad_w + y * dilation_w; - if(in_y >= 0 && in_y < in_w) - { - acc += double( - static_cast(input[o * in_nstride + k * in_cstride + - in_x * in_w + in_y]) * - static_cast(weights(w, k, x, y))); - } - } - } - } - } - acc = bias_mode != 0 ? 
acc + static_cast(bias[w]) : acc; - output[o * out_nstride + w * out_cstride + i * out_hstride + j] = - static_cast(acc); - } - } - } - } -} - -template -void batchNormSpatialHostInference(const tensor& input, - tensor& output, - const tensor& scale, - const tensor& bias, - double epsilon, - const tensor& estimatedMean, - const tensor& estimatedVariance, - bool useInverseVariance = false) -{ - - int n_batches, channels, height, width; - std::tie(n_batches, channels, height, width) = miopen::tien<4>(input.desc.GetLengths()); - miopen::par_for(channels, 1, [&](int cidx) { // via channel - V mean = estimatedMean(0, cidx, 0, 0); - V variance = estimatedVariance(0, cidx, 0, 0); - double invertVar = - useInverseVariance ? static_cast(variance) : 1.0 / sqrt(variance + epsilon); - // process the batch per channel - for(int row = 0; row < height; row++) - { // via rows - for(int column = 0; column < width; column++) - { // via columns - for(int bidx = 0; bidx < n_batches; bidx++) - { // via mini_batch - double elemStd = static_cast(input(bidx, cidx, row, column)) - mean; - double inhat = elemStd * invertVar; - output(bidx, cidx, row, column) = - static_cast(scale(0, cidx, 0, 0) * inhat + bias(0, cidx, 0, 0)); - // printf("output: %f\n",scale(0, cidx, 0, 0) * inhat + bias(0, cidx, 0, 0)); - } - } - } - }); -} - -template -void batchNormPerActivHostInference(const tensor& input, - tensor& output, - const tensor& scale, - const tensor& bias, - double epsilon, - const tensor& estimatedMean, - const tensor& estimatedVariance, - bool useInverseVariance = false) -{ - int n_batches, channels, height, width; - std::tie(n_batches, channels, height, width) = miopen::tien<4>(input.desc.GetLengths()); - miopen::par_for(channels, 1, [&](int cidx) { // via channel - for(int row = 0; row < height; row++) - { // via rows - for(int column = 0; column < width; column++) - { // via columns - // apply down the n_batch dimension - double mean = estimatedMean(0, cidx, row, column); - double 
variance = estimatedVariance(0, cidx, row, column); - double elemInvVar = useInverseVariance ? variance : 1.0 / sqrt(variance + epsilon); - for(int bidx = 0; bidx < n_batches; bidx++) - { // via mini_batch - // per (x-dims) channel load a block of data into LDS - double elemStd = input(bidx, cidx, row, column) - mean; - double inhat = elemStd * elemInvVar; - output(bidx, cidx, row, column) = - scale(0, cidx, row, column) * inhat + bias(0, cidx, row, column); - // printf("output: %f\n",output(bidx, cidx, row, column)); - } - } - } - }); -} - -template -void batchNormSpatialHostFwdTrain(const tensor& input, - tensor& out, - const tensor& scale, - const tensor& bias, - double epsilon, - double expAvgFactor, - tensor& saveMean, - tensor& saveInvVar, - tensor& runMean, - tensor& runVar) -{ - - int height, width, n_batch, channels; - std::tie(n_batch, channels, height, width) = miopen::tien<4>(input.desc.GetLengths()); - const auto nhw = double(height * width * n_batch); - - miopen::par_for(channels, 1, [&](int cidx) { - double elemStd = 0.; - double variance_accum = 0.; - double mean_accum = 0.; - double invVar = 0.; - double newRunMean = 0.; - double adjust = 0.; - - // process the batch per channel - for(int bidx = 0; bidx < n_batch; bidx++) - { // via mini_batch - for(int row = 0; row < height; row++) - { // via rows - for(int column = 0; column < width; column++) - { // via columns - // #1 calculate the mean - // iterating through the stack of images in the mini_batch - auto inval = static_cast(input(bidx, cidx, row, column)); - mean_accum += inval; - variance_accum += inval * inval; - } // end for (column) - } // end for (row) - } // end for (n) - - mean_accum /= nhw; - variance_accum /= nhw; - variance_accum += (-mean_accum * mean_accum); - invVar = 1.0 / sqrt(variance_accum + epsilon); - - // #4 apply the normalization - // x_hat = (x_i - mean) / sqrt(variance_accum + epsilon) - for(int bidx = 0; bidx < n_batch; bidx++) - { // via mini_batch - for(int row = 0; 
row < height; row++) - { // via rows - for(int column = 0; column < width; column++) - { // via columns - // #5 Gamma and Beta adjust - // y_i = gamma*x_hat + beta - elemStd = (static_cast(input(bidx, cidx, row, column)) - - mean_accum); // (x_i - mean) - out(bidx, cidx, row, column) = static_cast( - scale(0, cidx, 0, 0) * (invVar * elemStd) + bias(0, cidx, 0, 0)); - } // for (column) - } // for (row) - } // end for(n_batchs) - if(!saveMean.data.empty()) - { - saveMean(0, cidx, 0, 0) = mean_accum; - saveInvVar(0, cidx, 0, 0) = invVar; - } - if(!runMean.data.empty()) - { - newRunMean = runMean(0, cidx, 0, 0) * (1 - expAvgFactor); - runMean(0, cidx, 0, 0) = mean_accum * expAvgFactor + newRunMean; // newMean*factor + tmp - // var(n+1) = p * var(n-1) + (1 - p)*(b/b-1)*var(n) - adjust = (n_batch * height * width == 1) ? variance_accum - : (nhw / (nhw - 1)) * variance_accum; - runVar(0, cidx, 0, 0) = - (1 - expAvgFactor) * runVar(0, cidx, 0, 0) + expAvgFactor * adjust; - } - }); -} - -template -void batchNormSpatialHostBwdTrain(const tensor& x_input, - tensor& dy_input, - tensor& dx_out, - const tensor& bnScale, - const tensor& bnBias, - tensor& dscale, - tensor& dbias, - const tensor& savedMean, - const tensor& savedInvVar, - miopenActivationMode_t activ_mode, - double activ_beta, - double activ_alpha) -{ - double activ_gamma = 0.; - int height, width, n_batch, channels; - std::tie(n_batch, channels, height, width) = miopen::tien<4>(x_input.desc.GetLengths()); - auto nhw = double(height * width * n_batch); - int in_cstride = height * width; - - if(activ_mode > 0) - { - tensor input_norm = - tensor{x_input.desc.GetLayout_t(), x_input.desc.GetLengths()}; - miopen::par_for(channels, 1, [&](int cidx) { - double mean = 0.0; - double invVar = 0.0; - double elemStd = 0.; - double mean_accum = 0.0; - double variance_accum = 0.0; - if(!savedMean.data.empty()) - { - mean = static_cast(savedMean(0, cidx, 0, 0)); // HxW elements - invVar = static_cast(savedInvVar(0, cidx, 0, 0)); 
// HxW elements - } - else - { - for(int row = 0; row < height; row++) - { // via rows - for(int column = 0; column < width; column++) - { // via columns - for(int bidx = 0; bidx < n_batch; bidx++) - { // via mini_batch - auto inval = static_cast(x_input(bidx, cidx, row, column)); - mean_accum += inval; - variance_accum += inval * inval; - } - } - } - mean_accum /= nhw; - variance_accum /= nhw; - variance_accum += (-mean_accum * mean_accum); - mean = mean_accum; - invVar = 1.0 / sqrt(variance_accum); - } - for(int row = 0; row < height; row++) - { // via rows - for(int column = 0; column < width; column++) - { // via columns - for(int bidx = 0; bidx < n_batch; bidx++) - { // via mini_batch - elemStd = static_cast(x_input(bidx, cidx, row, column)) - - mean; // (x_i - mean) - input_norm(bidx, cidx, row, column) = static_cast( - bnScale(0, cidx, 0, 0) * (elemStd * invVar) + bnBias(0, cidx, 0, 0)); - } - } - } - }); - - activationHostBnormBwd(activ_mode, - activ_gamma, - activ_beta, - activ_alpha, - dy_input.data, - input_norm.data, - dy_input.data); - } - - miopen::par_for(channels, 1, [&](int cidx) { - double elemStd = 0.; - unsigned int xhat_index; - double mean = 0.0; - double invVar = 0.0; - double dyelem = 0.; - std::vector xhat(static_cast(n_batch) * in_cstride, 0.0); - // process the batch per channel - dscale(0, cidx, 0, 0) = 0.; - dbias(0, cidx, 0, 0) = 0.; - - if(!savedMean.data.empty()) - { - - mean = savedMean(0, cidx, 0, 0); // HxW elements - invVar = savedInvVar(0, cidx, 0, 0); // HxW elements - } - else - { - double variance_accum = 0.; - double mean_accum = 0.; - double inv_Var = 0.; - - // process the batch per channel - for(int bidx = 0; bidx < n_batch; bidx++) - { // via mini_batch - for(int row = 0; row < height; row++) - { // via rows - for(int column = 0; column < width; column++) - { // via columns - // #1 calculate the mean - // iterating through the stack of images in the mini_batch - auto inval = static_cast(x_input(bidx, cidx, row, column)); 
- mean_accum += inval; - variance_accum += inval * inval; - } // end for (column) - } // end for (row) - } // end for (n) - - mean_accum /= nhw; - variance_accum /= nhw; - variance_accum += (-mean_accum * mean_accum); - inv_Var = 1.0 / sqrt(variance_accum); - - mean = mean_accum; - invVar = inv_Var; - } - for(int row = 0; row < height; row++) - { // via rows - for(int column = 0; column < width; column++) - { // via columns - for(int bidx = 0; bidx < n_batch; bidx++) - { // via mini_batch - xhat_index = in_cstride * bidx + (width * row + column); - // per (x-dims) channel load a block of data into LDS - elemStd = static_cast(x_input(bidx, cidx, row, column)) - - mean; // (x_i - mean) - xhat[xhat_index] = elemStd * invVar; - dyelem = static_cast(dy_input(bidx, cidx, row, column)); - dbias(0, cidx, 0, 0) += dyelem; - dscale(0, cidx, 0, 0) += xhat[xhat_index] * dyelem; - } // end for(n_batch) - } // for (column) - } // for (row) - - for(int row = 0; row < height; row++) - { // via rows - for(int column = 0; column < width; column++) - { // via columns - for(int bidx = 0; bidx < n_batch; bidx++) - { // via mini_batch - xhat_index = in_cstride * bidx + (width * row + column); - - double tmp1 = nhw * dy_input(bidx, cidx, row, column) - dbias(0, cidx, 0, 0); - double tmp2 = -xhat[xhat_index] * dscale(0, cidx, 0, 0); - double tmp3 = (bnScale(0, cidx, 0, 0) * invVar) / nhw; - dx_out(bidx, cidx, row, column) = - static_cast(tmp3 * (tmp2 + tmp1)); - } // end for(n_batchs) - } // for (column) - } // for (row) - }); // for (channel) -} - -template -void batchNormActivSpatialHostBwdTrain(miopenActivationMode_t activMode, - double gamma, - double beta, - double alpha, - const tensor& x_input, - const tensor& dy_input, - const tensor& y_input, - tensor& dx_out, - const tensor& bnScale, - const tensor& bias, - tensor& dscale, - tensor& dbias, - const tensor& savedMean, - const tensor& savedInvVar) -{ - - int height, width, n_batch, channels; - std::tie(n_batch, channels, height, 
width) = miopen::tien<4>(x_input.desc.GetLengths()); - auto nhw = double(height * width * n_batch); - int in_cstride = height * width; - - miopen::par_for(channels, 1, [&](int cidx) { - double elemStd = 0.; - unsigned int xhat_index; - double mean = static_cast(savedMean(0, cidx, 0, 0)); // HxW elements - double invVar = static_cast(savedInvVar(0, cidx, 0, 0)); // HxW elements - double dyelem = 0.; - std::vector xhat(static_cast(n_batch) * in_cstride, 0.0); - // process the batch per channel - dscale(0, cidx, 0, 0) = 0.; - dbias(0, cidx, 0, 0) = 0.; - - for(int row = 0; row < height; row++) - { // via rows - for(int column = 0; column < width; column++) - { // via columns - for(int bidx = 0; bidx < n_batch; bidx++) - { // via mini_batch - - // recompute forward batch norm - xhat_index = in_cstride * bidx + (width * row + column); - // per (x-dims) channel load a block of data into LDS - elemStd = static_cast(x_input(bidx, cidx, row, column)) - - mean; // (x_i - mean) - xhat[xhat_index] = elemStd * invVar; - double bnrefowd = - bnScale(0, cidx, 0, 0) * xhat[xhat_index] + bias(0, cidx, 0, 0); - activationHostBwdElement(activMode, - gamma, - beta, - alpha, - dy_input(bidx, cidx, row, column), - bnrefowd, - y_input(bidx, cidx, row, column), - dyelem); - dbias(0, cidx, 0, 0) += dyelem; - dscale(0, cidx, 0, 0) += xhat[xhat_index] * dyelem; - } // end for(n_batch) - } // for (column) - } // for (row) - - for(int row = 0; row < height; row++) - { // via rows - for(int column = 0; column < width; column++) - { // via columns - for(int bidx = 0; bidx < n_batch; bidx++) - { // via mini_batch - xhat_index = in_cstride * bidx + (width * row + column); - double bnrefowd = - bnScale(0, cidx, 0, 0) * xhat[xhat_index] + bias(0, cidx, 0, 0); - activationHostBwdElement(activMode, - gamma, - beta, - alpha, - dy_input(bidx, cidx, row, column), - bnrefowd, - y_input(bidx, cidx, row, column), - dyelem); - // double tmp1 = nhw * dy_input(bidx, cidx, row, column) - dbias(0, cidx, 0, 0); - 
double tmp1 = nhw * dyelem - dbias(0, cidx, 0, 0); - double tmp2 = -xhat[xhat_index] * dscale(0, cidx, 0, 0); - double tmp3 = (bnScale(0, cidx, 0, 0) * invVar) / nhw; - dx_out(bidx, cidx, row, column) = static_cast(tmp3 * (tmp2 + tmp1)); - } // end for(n_batchs) - } // for (column) - } // for (row) - }); // for (channel) -} - -template -void batchNormPerActHostFwdTrain(const tensor& input, - tensor& out, - const tensor& scale, - const tensor& bias, - double epsilon, - double expAvgFactor, - tensor& saveMean, - tensor& saveInvVar, - tensor& runMean, - tensor& runVar) -{ - - int height, width, n_batch, channels; - std::tie(n_batch, channels, height, width) = miopen::tien<4>(input.desc.GetLengths()); - const auto n = double(n_batch); - - miopen::par_for(channels, 1, [&](int cidx) { - double mean_accum = 0.; - double variance_accum = 0.; - double elemStd = 0.; - double elemInvVar = 0.; - double inhat = 0.; - double newRunMean = 0.; - double adjust = 0.; - - // process the batch per channel - for(int row = 0; row < height; row++) - { // via rows - for(int column = 0; column < width; column++) - { // via columns - - mean_accum = 0.; - variance_accum = 0.; - for(int bidx = 0; bidx < n_batch; bidx++) - { // via mini_batch - // #1 calculate the mean :: iterating through the stack of images in the - // mini_batch - auto intval = static_cast(input(bidx, cidx, row, column)); - mean_accum += intval; - variance_accum += intval * intval; - } - mean_accum /= n; - variance_accum /= n; - variance_accum = variance_accum - (mean_accum * mean_accum); - elemInvVar = 1.0 / double(sqrt(variance_accum + epsilon)); - - // #4 apply the normalization :: x_hat = (x_i - mean) / sqrt(variance_accum - - // epsilon) - for(int bidx = 0; bidx < n_batch; bidx++) - { // via mini_batch - elemStd = (input(bidx, cidx, row, column) - mean_accum); // (x_i - mean) - inhat = elemStd * elemInvVar; - // #5 Gamma and Beta adjust :: y_i = gamma*x_hat + beta - out(bidx, cidx, row, column) = static_cast( - 
scale(0, cidx, row, column) * inhat + bias(0, cidx, row, column)); - } // end for(n_batch) - - if(!runMean.data.empty()) - { - newRunMean = runMean(0, cidx, row, column) * (1.0 - expAvgFactor); - runMean(0, cidx, row, column) = - mean_accum * expAvgFactor + newRunMean; // newMean*factor + tmp - } - // var(n+1) = p * var(n-1) + (1 - p)*(b/b-1)*var(n) - if(!runVar.data.empty()) - { - adjust = (n_batch == 1) ? variance_accum : (n / (n - 1.0)) * variance_accum; - runVar(0, cidx, row, column) = - (1 - expAvgFactor) * runVar(0, cidx, row, column) + expAvgFactor * adjust; - } - if(!saveMean.data.empty() || !saveInvVar.data.empty()) - { - saveMean(0, cidx, row, column) = static_cast(mean_accum); - saveInvVar(0, cidx, row, column) = static_cast(elemInvVar); - } - - } // for (column) - } // for (row) - }); -} - -template -void batchNormPerActHostBwdTrain(const tensor& x_input, - const tensor& dy_input, - tensor& dx_out, - const tensor& scale, - tensor& dscale, - tensor& dbias, - const tensor& savedMean, - const tensor& savedInvVar) -{ - - int height, width, n_batch, channels; - std::tie(n_batch, channels, height, width) = miopen::tien<4>(x_input.desc.GetLengths()); - int in_cstride = height * width; - auto n = double(n_batch); - - miopen::par_for(channels, 1, [&](int cidx) { - double elemStd = 0.; - unsigned int xhat_index; - double mean = 0.; - double elemInvVar = 0.; - double dyelem = 0.; - double dxhat = 0.; - double dxhathat = 0.; - double tmp1 = 0.; - std::vector xhat(static_cast(n_batch) * in_cstride); - - // process the batch per channel - for(int row = 0; row < height; row++) - { // via rows - for(int column = 0; column < width; column++) - { // via columns - dxhat = 0.; - dxhathat = 0.; - - if(!savedMean.data.empty()) - { - mean = savedMean(0, cidx, row, column); // HxW elements - elemInvVar = savedInvVar(0, cidx, row, column); // HxW elements - } - else - { - double variance_accum = 0.; - double mean_accum = 0.; - - // process the batch per channel - for(int bidx = 
0; bidx < n_batch; bidx++) - { // via mini_batch - auto inval = static_cast(x_input(bidx, cidx, row, column)); - mean_accum += inval; - variance_accum += inval * inval; - } // end for (n) - - mean_accum /= n; - variance_accum /= n; - variance_accum += (-mean_accum * mean_accum); - - mean = mean_accum; - elemInvVar = 1.0 / sqrt(variance_accum); - } - - for(int bidx = 0; bidx < n_batch; bidx++) - { // via mini_batch - xhat_index = in_cstride * bidx + (width * row + column); - // per (x-dims) channel load a block of data into LDS - elemStd = static_cast(x_input(bidx, cidx, row, column)) - - mean; // (x_i - mean) - xhat[xhat_index] = elemStd * elemInvVar; - dyelem = static_cast(dy_input(bidx, cidx, row, column)); - dbias(0, cidx, row, column) += dyelem; - dscale(0, cidx, row, column) += xhat[xhat_index] * dyelem; - tmp1 = scale(0, cidx, row, column) * dyelem; - dxhat += tmp1; - dxhathat += tmp1 * xhat[xhat_index]; - - } // end for(n_batchs) - - for(int bidx = 0; bidx < n_batch; bidx++) - { // via mini_batch - xhat_index = in_cstride * bidx + (width * row + column); - tmp1 = xhat[xhat_index] * dxhathat + dxhat; - double tmp2 = - n_batch * scale(0, cidx, row, column) * dy_input(bidx, cidx, row, column) - - tmp1; - double tmp3 = elemInvVar / (double(n)); - dx_out(bidx, cidx, row, column) = static_cast(tmp3 * tmp2); - } // end for(n_batchs) - } // for (column) - } // for (row) - }); -} - -template -void batchNormActivPerActHostBwdTrain(miopenActivationMode_t activMode, - double gamma, - double beta, - double alpha, - const tensor& x_input, - const tensor& dy_input, - const tensor& y_input, - tensor& dx_out, - const tensor& scale, - const tensor& bias, - tensor& dscale, - tensor& dbias, - const tensor& savedMean, - const tensor& savedInvVar) -{ - - int height, width, n_batch, channels; - std::tie(n_batch, channels, height, width) = miopen::tien<4>(x_input.desc.GetLengths()); - int in_cstride = height * width; - auto n = double(n_batch); - - miopen::par_for(channels, 1, 
[&](int cidx) { - double elemStd = 0.; - unsigned int xhat_index; - double mean = 0.; - double elemInvVar = 0.; - double dyelem = 0.; - double dxhat = 0.; - double dxhathat = 0.; - double tmp1 = 0.; - std::vector xhat(static_cast(n_batch) * in_cstride); - - // process the batch per channel - for(int row = 0; row < height; row++) - { // via rows - for(int column = 0; column < width; column++) - { // via columns - dxhat = 0.; - dxhathat = 0.; - - mean = savedMean(0, cidx, row, column); // HxW elements - elemInvVar = savedInvVar(0, cidx, row, column); // HxW elements - - for(int bidx = 0; bidx < n_batch; bidx++) - { // via mini_batch - xhat_index = in_cstride * bidx + (width * row + column); - // per (x-dims) channel load a block of data into LDS - elemStd = static_cast(x_input(bidx, cidx, row, column)) - - mean; // (x_i - mean) - xhat[xhat_index] = elemStd * elemInvVar; - double bnrefowd = - scale(0, cidx, row, column) * xhat[xhat_index] + bias(0, cidx, row, column); - activationHostBwdElement(activMode, - gamma, - beta, - alpha, - dy_input(bidx, cidx, row, column), - bnrefowd, - y_input(bidx, cidx, row, column), - dyelem); - /*dyelem = static_cast(dy_input(bidx, cidx, row, column));*/ - dbias(0, cidx, row, column) += dyelem; - dscale(0, cidx, row, column) += xhat[xhat_index] * dyelem; - tmp1 = scale(0, cidx, row, column) * dyelem; - dxhat += tmp1; - dxhathat += tmp1 * xhat[xhat_index]; - - } // end for(n_batchs) - - for(int bidx = 0; bidx < n_batch; bidx++) - { // via mini_batch - xhat_index = in_cstride * bidx + (width * row + column); - tmp1 = xhat[xhat_index] * dxhathat + dxhat; - double bnrefowd = - scale(0, cidx, row, column) * xhat[xhat_index] + bias(0, cidx, row, column); - activationHostBwdElement(activMode, - gamma, - beta, - alpha, - dy_input(bidx, cidx, row, column), - bnrefowd, - y_input(bidx, cidx, row, column), - dyelem); - double tmp2 = (n_batch * scale(0, cidx, row, column) * dyelem) - tmp1; - double tmp3 = elemInvVar / (double(n)); - dx_out(bidx, 
cidx, row, column) = static_cast(tmp3 * tmp2); - } // end for(n_batchs) - } // for (column) - } // for (row) - }); -} - -template -void visitActivationHostInfer( - miopenActivationMode_t activMode, double gamma, double beta, double alpha, F f) -{ - switch(activMode) - { - case miopenActivationPASTHRU: // x - f([=](double x) { return x; }); - break; - case miopenActivationLOGISTIC: // 1 / (1 + e^-x) //Sigmoid - f([=](double x) { return (1. / (1. + std::exp(-x))); }); - break; - case miopenActivationTANH: // beta * tanh(alpha * x) - f([=](double x) { return (beta * std::tanh(alpha * x)); }); - break; - case miopenActivationRELU: // max(0, x) - f([=](double x) { return ((x > 0.) ? x : 0.); }); - break; - case miopenActivationSOFTRELU: // log(1 + e^x) // bonomial normal log likelihood - f([=](double x) { - return (x > 0.) ? (x + std::log1p(std::exp(-x))) : (std::log1p(std::exp(x))); - }); - break; - case miopenActivationABS: // abs(x) - f([=](double x) { return (std::fabs(x)); }); - break; - case miopenActivationPOWER: // (alpha + beta * x) ^ gamma - f([=](double x) { - auto v = (alpha + beta * x); - return (v <= std::numeric_limits::epsilon()) ? 0. : pow(v, gamma); - }); - break; - case miopenActivationCLIPPEDRELU: // min(alpha, max(0, x)) - f([=](double x) { return (std::min(alpha, std::max(double(0.), x))); }); - break; - case miopenActivationLEAKYRELU: // alpha * x | x<=0; x | x>0 - f([=](double x) { return ((x > 0.) ? x : x * alpha); }); - break; - case miopenActivationELU: // alpha * (exp(x)-1) | x<=0; x | x>0 - f([=](double x) { return ((x > 0.) ? 
x : alpha * std::expm1(x)); }); - break; - case miopenActivationCLAMP: // max(alpha, min(beta, x)) - f([=](double x) { return (std::max(alpha, std::min(beta, x))); }); - break; - // default: printf("ERROR: unknown neuron type: %d\n", activMode); break; - } -} - -template -inline void activationHostInfer(miopenActivationMode_t activMode, - double gamma, - double beta, - double alpha, - const std::vector input, - std::vector& output) -{ - visitActivationHostInfer(activMode, gamma, beta, alpha, [&](auto f) { - miopen::par_for(input.size(), 1, [&](int index) { - output[index] = static_cast(f(static_cast(input[index]))); - }); - }); -} - -template -void visitActivationHostBwd( - miopenActivationMode_t activMode, double gamma, double beta, double alpha, F f) -{ - switch(activMode) - { - case miopenActivationPASTHRU: // x - f([=](double dy, double, double) { return dy; }); - break; - case miopenActivationLOGISTIC: // 1 / (1 + e^-x) //Sigmoid - f([=](double dy, double, double y) { return dy * y * (1 - y); }); - break; - case miopenActivationTANH: // beta * tanh(alpha * x) - f([=](double dy, double, double y) { return dy * alpha * (beta - y * y / beta); }); - break; - case miopenActivationRELU: // max(0, x) - f([=](double dy, double x, double) { return (x > 0) ? dy : 0; }); - break; - case miopenActivationSOFTRELU: // log(1 + e^x) // bonomial normal log likelihood - f([=](double dy, double x, double) { - static const double threshold = 50.; - double expval = std::exp(std::min(x, threshold)); - return dy * expval / (expval + 1.0); - }); - break; - case miopenActivationABS: // abs(x) - f([=](double dy, double x, double) { return dy * ((x > 0) ? 1 : -1); }); - break; - case miopenActivationPOWER: // (alpha + beta * x) ^ gamma - f([=](double, double x, double y) { - auto v = alpha + beta * x; - return v <= std::numeric_limits::epsilon() ? 
0 : gamma * beta * y / v; - }); - break; - case miopenActivationCLIPPEDRELU: // min(alpha, max(0, x)) - f([=](double dy, double x, double) { return (x > 0 && x <= alpha) ? dy : 0; }); - break; - case miopenActivationLEAKYRELU: // alpha * x | x<=0; x | x>0 - f([=](double dy, double x, double) { return dy * ((x > 0) ? 1 : alpha); }); - break; - case miopenActivationELU: // alpah * (exp(x)-1) | x<=0; x | x>0 - f([=](double dy, double x, double y) { return dy * ((x > 0) ? 1 : y + alpha); }); - break; - case miopenActivationCLAMP: // max(alpha, min(beta, x)) - f([=](double dy, double x, double) { return (x > alpha && x <= beta) ? dy : 0; }); - break; - // default: printf("ERROR: unknown neuron type: %d\n", activMode); break; - } -} - -template -inline void activationHostBnormBwd(miopenActivationMode_t activMode, - double gamma, - double beta, - double alpha, - const std::vector dyinput, - const std::vector xinput, - std::vector& output) -{ - double dummy; - visitActivationHostBwd(activMode, gamma, beta, alpha, [&](auto f) { - miopen::par_for(dyinput.size(), 1, [&](int index) { - output[index] = static_cast( - f(static_cast(dyinput[index]), static_cast(xinput[index]), dummy)); - }); - }); -} - -template -inline void activationHostBwd(miopenActivationMode_t activMode, - double gamma, - double beta, - double alpha, - const std::vector dyinput, - const std::vector xinput, - const std::vector yinput, - std::vector& output) -{ - visitActivationHostBwd(activMode, gamma, beta, alpha, [&](auto f) { - miopen::par_for(dyinput.size(), 1, [&](int index) { - output[index] = static_cast(f(static_cast(dyinput[index]), - static_cast(xinput[index]), - static_cast(yinput[index]))); - }); - }); -} - -inline void activationHostBwdElement(miopenActivationMode_t activMode, - double gamma, - double beta, - double alpha, - const double dyinput, - const double xinput, - const double yinput, - double& output) -{ - visitActivationHostBwd(activMode, gamma, beta, alpha, [&](auto f) { - output = 
static_cast(f(dyinput, xinput, yinput)); - }); -} - -template -tensor get_output_tensor(const miopen::ConvolutionDescriptor& filter, - const tensor& input, - const tensor& weights) -{ - return tensor{filter.GetForwardOutputTensor(input.desc, weights.desc, miopen_type{})}; -} +#include diff --git a/projects/miopen/test/gemm.hpp b/projects/miopen/test/gemm.hpp index 81c38db0fdf3..34fa7db11bec 100644 --- a/projects/miopen/test/gemm.hpp +++ b/projects/miopen/test/gemm.hpp @@ -1,120 +1,5 @@ -/******************************************************************************* - * - * MIT License - * - * Copyright (c) 2017 Advanced Micro Devices, Inc. - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - * - *******************************************************************************/ +// Forwarding header — implementation moved to miopen_utils. 
#ifndef GUARD_GEMM_HPP #define GUARD_GEMM_HPP - -#include -#include -#include - -/* - A and B rows and cols should be passed as default values (NxM, MxK), independently of - a_transponse/b_transpose flag value - C rows and cols should have correct values based on a_transponse/b_transpose values - A, B, C strides should have corret values based on a_transponse/b_transpose values -*/ -template -void gemm_cpu(const Dtype* a_ptr, - const size_t a_cols, - const size_t a_rows, - const size_t a_stride, - const bool a_transpose, - const Dtype* b_ptr, - const size_t b_cols, - const size_t b_rows, - const size_t b_stride, - const bool b_transpose, - Dtype* c_ptr, - const size_t c_cols, - const size_t c_rows, - const size_t c_stride, - double alpha = 1.0, - double beta = 1.0) -{ - if((!a_transpose && !b_transpose && - ((a_cols != b_rows) || (a_rows != c_rows) || (b_cols != c_cols))) || - (a_transpose && b_transpose && - ((a_rows != b_cols) || (a_cols != c_rows) || (b_rows != c_cols))) || - (a_transpose && !b_transpose && - ((a_rows != b_rows) || (a_cols != c_rows) || (b_cols != c_cols))) || - (!a_transpose && b_transpose && - ((a_cols != b_cols) || (a_rows != c_rows) || (b_rows != c_cols)))) - { - MIOPEN_THROW("MM_CPU_ERROR. Incompatible matrix size:\nA: " + std::to_string(a_rows) + "x" + - std::to_string(a_cols) + " transpose: " + (a_transpose ? "true" : "false") + - "\nB: " + std::to_string(b_rows) + "x" + std::to_string(b_cols) + - " transpose: " + (b_transpose ? "true" : "false") + - "\nC: " + std::to_string(c_rows) + "x" + std::to_string(c_cols) + "\n"); - } - - size_t inner_loop_limit = a_transpose ? 
a_rows : a_cols; - auto inner_loop = [&](int m, int n) { - double el = 0.0; - if(!a_transpose && !b_transpose) - { - miopen::ford(inner_loop_limit)([&](int k) { - el += static_cast(a_ptr[m * a_stride + k]) * - static_cast(b_ptr[k * b_stride + n]); - }); - } - else if(!a_transpose && b_transpose) - { - miopen::ford(inner_loop_limit)([&](int k) { - el += static_cast(a_ptr[m * a_stride + k]) * - static_cast(b_ptr[n * b_stride + k]); - }); - } - else if(a_transpose && !b_transpose) - { - miopen::ford(inner_loop_limit)([&](int k) { - el += static_cast(a_ptr[k * a_stride + m]) * - static_cast(b_ptr[k * b_stride + n]); - }); - } - else - { - miopen::ford(inner_loop_limit)([&](int k) { - el += static_cast(a_ptr[k * a_stride + m]) * - static_cast(b_ptr[n * b_stride + k]); - }); - } - - c_ptr[m * c_stride + n] = - static_cast(beta * static_cast(c_ptr[m * c_stride + n]) + alpha * el); - }; - - constexpr size_t iter_margin = 1'048'576; // 2^20 - if(c_rows * c_cols * inner_loop_limit > iter_margin) - { - miopen::par_ford(c_rows, c_cols)(inner_loop); - } - else - { - miopen::ford(c_rows, c_cols)(inner_loop); - } -} - +#include #endif diff --git a/projects/miopen/test/network_data.hpp b/projects/miopen/test/network_data.hpp index 987d4dda9929..7a0dbcd702dd 100644 --- a/projects/miopen/test/network_data.hpp +++ b/projects/miopen/test/network_data.hpp @@ -1,438 +1,5 @@ -/******************************************************************************* - * - * MIT License - * - * Copyright (c) 2017 Advanced Micro Devices, Inc. 
- * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - * - *******************************************************************************/ - +// Forwarding header — implementation moved to miopen_utils. #ifndef GUARD_MIOPEN_TEST_NETWORK_DATA_HPP #define GUARD_MIOPEN_TEST_NETWORK_DATA_HPP - -#include -#include -#include -#include - -#ifndef MIOPEN_TEST_DEFAULT_BATCH_SIZE_FACTOR -#define MIOPEN_TEST_DEFAULT_BATCH_SIZE_FACTOR 0 -#endif - -template -inline constexpr T pick_batch_size(T x, T y) -{ - return (y == 0 || y > x) ? 
1 : x / y; -} - -// Reduce tests execution time -#define MIOPEN_TESTS_GET_INPUTS_ENABLE_HUGE_TENSORS 1 - -template -inline std::set> get_inputs(T n = MIOPEN_TEST_DEFAULT_BATCH_SIZE_FACTOR) -{ - // clang-format off - return - { - { pick_batch_size(32, n), 1, 14, 14 }, - { pick_batch_size(100, n), 1, 8, 8 }, - { pick_batch_size(256, n), 1, 27, 27 }, -#if MIOPEN_TESTS_GET_INPUTS_ENABLE_HUGE_TENSORS - { pick_batch_size(64, n), 19, 1024,2048}, -#endif - { pick_batch_size(100, n), 3, 32, 32 }, - { pick_batch_size(100, n), 32, 16, 16 }, - { pick_batch_size(100, n), 32, 8, 8 }, - { pick_batch_size(128, n), 256, 12, 12 }, - { pick_batch_size(128, n), 3, 231, 231 }, - { pick_batch_size(128, n), 512, 12, 12 }, - { pick_batch_size(256, n), 256, 13, 13 }, - { pick_batch_size(256, n), 3, 227, 227 }, - { pick_batch_size(256, n), 384, 13, 13 }, - { pick_batch_size(256, n), 96, 27, 27 }, - { pick_batch_size(32, n), 128, 28, 28 }, - { pick_batch_size(32, n), 144, 14, 14 }, - { pick_batch_size(32, n), 192, 28, 28 }, - { pick_batch_size(32, n), 192, 7, 7 }, - { pick_batch_size(32, n), 256, 28, 28 }, - { pick_batch_size(32, n), 3, 224, 224 }, - { pick_batch_size(32, n), 32, 28, 28 }, - { pick_batch_size(32, n), 48, 7, 7 }, - { pick_batch_size(32, n), 480, 128, 256 }, - { pick_batch_size(32, n), 480, 64, 128 }, - { pick_batch_size(32, n), 512, 4, 4 }, - { pick_batch_size(32, n), 512, 64, 128 }, - { pick_batch_size(16, n), 64, 56, 56 }, - { pick_batch_size(32, n), 832, 7, 7 }, - { pick_batch_size(64, n), 128, 56, 56 }, - { pick_batch_size(64, n), 256, 28, 28 }, - { pick_batch_size(64, n), 3, 224, 224 }, - { pick_batch_size(64, n), 512, 28, 28 }, - { pick_batch_size(64, n), 64, 112, 112 }, - { pick_batch_size(32, n), 64, 14, 14 }, - { pick_batch_size(32, n), 192, 14, 14 }, - { pick_batch_size(32, n), 320, 28, 28 }, - { pick_batch_size(32, n), 576, 14, 14 }, - { pick_batch_size(32, n), 576, 4, 4 }, - { pick_batch_size(32, n), 1056, 7, 7 }, - { pick_batch_size(32, n), 2048, 11, 11 }, -#if 
MIOPEN_TESTS_GET_INPUTS_ENABLE_HUGE_TENSORS - { pick_batch_size(32, n), 16, 2048, 2048 }, - { pick_batch_size(32, n), 16, 3072, 3072 }, - { pick_batch_size(32, n), 16, 4096, 4096 }, -#endif - { 1, 1, 1, 1 } - }; - // clang-format on -} - -template -inline std::set> get_weights(T n = MIOPEN_TEST_DEFAULT_BATCH_SIZE_FACTOR) -{ - // clang-format off - return - { - { pick_batch_size(1024, n),1024, 3, 3 }, - { pick_batch_size(1024, n),512, 3, 3 }, - { pick_batch_size(128, n), 256, 1, 1 }, - { pick_batch_size(128, n), 528, 1, 1 }, - { pick_batch_size(128, n), 96, 3, 3 }, - { pick_batch_size(16, n), 192, 1, 1 }, - { pick_batch_size(224, n), 112, 3, 3 }, - { pick_batch_size(256, n), 96, 5, 5 }, - { pick_batch_size(288, n), 144, 3, 3 }, - { pick_batch_size(48, n), 832, 1, 1 }, - { pick_batch_size(512, n), 256, 3, 3 }, - { pick_batch_size(64, n), 1, 2, 2 }, - { pick_batch_size(64, n), 3, 3, 3 }, - { pick_batch_size(64, n), 3, 7, 7 }, - { pick_batch_size(64, n), 32, 5, 5 }, - { pick_batch_size(64, n), 480, 1, 1 }, - { pick_batch_size(64, n), 64, 1, 1 }, - { pick_batch_size(96, n), 3, 11, 11 }, - { pick_batch_size(192, n), 64, 5, 5 }, - { pick_batch_size(64, n), 64, 3, 3 }, - { pick_batch_size(224, n), 224, 3, 3 }, - { pick_batch_size(224, n), 192, 3, 3 }, - { pick_batch_size(128, n), 320, 1, 1 }, - { pick_batch_size(192, n), 576, 1, 1 }, - { pick_batch_size(128, n), 1056, 1, 1 }, - { pick_batch_size(128, n), 1024, 1, 1 }, - { pick_batch_size(512, n), 2048, 1, 1 } - }; - // clang-format on -} - -template -inline std::set> get_immed_inputs(T n = MIOPEN_TEST_DEFAULT_BATCH_SIZE_FACTOR) -{ - // clang-format off - return - { - { pick_batch_size(32, n), 1, 14, 14 }, - { pick_batch_size(256, n), 1, 27, 27 }, - { pick_batch_size(128, n), 512, 12, 12 }, - { pick_batch_size(256, n), 256, 13, 13 }, - { pick_batch_size(256, n), 3, 227, 227 }, - { pick_batch_size(32, n), 64, 56, 56 }, - { pick_batch_size(32, n), 96, 14, 14 }, - { pick_batch_size(32, n), 96, 28, 28 }, - { pick_batch_size(64, 
n), 128, 56, 56 }, - { pick_batch_size(64, n), 3, 224, 224 }, - { pick_batch_size(64, n), 256, 14, 14 }, - { 1, 1, 1, 1 } - }; - // clang-format on -} - -template -inline std::set> get_immed_weights(T n = MIOPEN_TEST_DEFAULT_BATCH_SIZE_FACTOR) -{ - // clang-format off - return - { - { pick_batch_size(208, n), 96, 3, 3 }, - { pick_batch_size(24, n), 512, 1, 1 }, - { pick_batch_size(256, n), 128, 3, 3 }, - { pick_batch_size(256, n), 256, 3, 3 }, - { pick_batch_size(256, n), 64, 5, 5 }, - { pick_batch_size(288, n), 144, 3, 3 }, - { pick_batch_size(96, n), 3, 11, 11 }, - { pick_batch_size(32, n), 128, 5, 5 }, - { pick_batch_size(32, n), 128, 1, 1 }, - { pick_batch_size(256, n), 256, 3, 3 }, - { pick_batch_size(512, n), 512, 3, 3 }, - { pick_batch_size(160, n), 128, 3, 3 }, - { pick_batch_size(32, n), 3, 7, 7 } - }; - // clang-format on -} - -template -inline std::set> -get_3d_conv_input_shapes(T n = MIOPEN_TEST_DEFAULT_BATCH_SIZE_FACTOR) -{ - // clang-format off - return - { - { pick_batch_size(128, n), 1, 1, 2, 2}, - { pick_batch_size(128, n), 64, 1, 1, 1}, - { pick_batch_size(128, n), 64, 3, 4, 4}, - { pick_batch_size(352, n), 32, 4, 9, 9}, - { pick_batch_size(192, n), 512, 3, 14, 14}, - { pick_batch_size(352, n), 512, 4, 28, 28}, - { pick_batch_size(256, n), 512, 4, 56, 56}, - { pick_batch_size(192, n), 3, 4, 227, 227}, - { pick_batch_size(128, n), 4, 4, 161, 700} - }; - // clang-format on -} - -template -inline std::set> -get_3d_conv_weight_shapes(T n = MIOPEN_TEST_DEFAULT_BATCH_SIZE_FACTOR) -{ - // clang-format off - return - { - { pick_batch_size( 128, n), 1, 1, 1, 1}, - { pick_batch_size( 352, n), 128, 1, 1, 1}, - { pick_batch_size( 256, n), 128, 1, 1, 1}, - { pick_batch_size( 352, n), 32, 3, 3, 3}, - { pick_batch_size( 352, n), 4, 3, 3, 3}, - { pick_batch_size( 160, n), 4, 3, 5, 5}, - { pick_batch_size( 128, n), 64, 5, 7, 7}, - { pick_batch_size( 192, n), 4, 3, 11, 11}, - { pick_batch_size( 128, n), 1, 3, 1, 7}, - { pick_batch_size( 128, n), 1, 3, 7, 1}, - { 
pick_batch_size( 128, n), 1, 3, 5, 20} - }; - // clang-format on -} - -template -inline std::set> get_bn_peract_inputs(T n = MIOPEN_TEST_DEFAULT_BATCH_SIZE_FACTOR) -{ - // clang-format off - return - { - { pick_batch_size(32, n), 4, 1024,2048}, //Making this much smaller - { pick_batch_size(100, n), 3, 32, 32 }, - { pick_batch_size(100, n), 32, 8, 8 }, - { pick_batch_size(128, n), 256, 12, 12 }, - { pick_batch_size(256, n), 3, 227, 227 }, - { pick_batch_size(64, n), 64, 112, 112 },//Batch-norm ResNet 152 after this line - { pick_batch_size(256, n), 1024, 14, 14 },// n is from the paper @ 256 - { pick_batch_size(256, n), 2048, 7, 7 }, - { pick_batch_size(256, n), 256, 56, 56 }, - { pick_batch_size(256, n), 256, 14, 14 }, - { pick_batch_size(256, n), 512, 28, 28 }, - { pick_batch_size(256, n), 512, 7, 7 }, - { pick_batch_size(256, n), 64, 112, 112 }, - { pick_batch_size(256, n), 64, 56, 56 },//Batch-norm Inception_v3 after this - { pick_batch_size(32, n), 1024, 1, 1 },// n is from the paper @ 32 - { pick_batch_size(32, n), 128, 14, 14 }, - { pick_batch_size(32, n), 128, 28, 28 }, - { pick_batch_size(32, n), 128, 4, 4 }, - { pick_batch_size(32, n), 128, 7, 7 }, - { pick_batch_size(32, n), 160, 7, 7 }, - { pick_batch_size(32, n), 192, 14, 14 }, - { pick_batch_size(32, n), 192, 56, 56 }, - { pick_batch_size(32, n), 192, 7, 7 }, - { pick_batch_size(32, n), 224, 14, 14 }, - { pick_batch_size(32, n), 256, 7, 7 }, - { pick_batch_size(32, n), 256, 14, 14 }, - { pick_batch_size(32, n), 352, 7, 7 }, - { pick_batch_size(32, n), 64, 112, 112 }, - { pick_batch_size(32, n), 64, 14, 14 }, - { pick_batch_size(32, n), 64, 56, 56 }, - { pick_batch_size(32, n), 96, 28, 28 }, - { pick_batch_size(32, n), 32, 256, 512 }, //Killing this config. 
Takes way too long on the CPU - { pick_batch_size(32, n), 256, 28, 28 }, - { pick_batch_size(32, n), 3, 224, 224 }, - { pick_batch_size(32, n), 480, 128, 256 }, - { pick_batch_size(32, n), 528, 64, 128 } - }; - // clang-format on -} - -template -inline std::set> get_bn_spatial_inputs(T n = MIOPEN_TEST_DEFAULT_BATCH_SIZE_FACTOR) -{ - // clang-format off - return - { - { pick_batch_size(32, n), 4, 1024,2048}, //Making this much smaller - { pick_batch_size(32, n), 192, 256, 512 }, - { pick_batch_size(32, n), 480, 128, 256 }, - { pick_batch_size(256, n), 3, 227, 227 }, - { pick_batch_size(256, n), 64, 112, 112 }, - { pick_batch_size(512, n), 16, 32, 32 }, - { pick_batch_size(100, n), 32, 8, 8 }, - { pick_batch_size(128, n), 256, 12, 12 }, - { pick_batch_size(256, n), 128, 28, 28 }, - { pick_batch_size(256, n), 2048, 7, 7 }, - { pick_batch_size(256, n), 256, 56, 56 }, - { pick_batch_size(256, n), 256, 14, 14 }, - { pick_batch_size(256, n), 512, 28, 28 }, - { pick_batch_size(256, n), 512, 7, 7 }, - { pick_batch_size(256, n), 64, 56, 56 },//Batch-norm Inception_v3 after this - { pick_batch_size(32, n), 1024, 1, 1 },// n is from the paper @ 32 - { pick_batch_size(32, n), 128, 14, 14 }, - { pick_batch_size(32, n), 128, 4, 4 }, - { pick_batch_size(32, n), 160, 7, 7 }, - { pick_batch_size(32, n), 192, 14, 14 }, - { pick_batch_size(32, n), 192, 56, 56 }, - { pick_batch_size(32, n), 192, 7, 7 }, - { pick_batch_size(32, n), 224, 14, 14 }, - { pick_batch_size(32, n), 256, 7, 7 }, - { pick_batch_size(32, n), 352, 7, 7 }, - { pick_batch_size(32, n), 64, 14, 14 }, - { pick_batch_size(32, n), 64, 28, 28 }, - { pick_batch_size(32, n), 64, 56, 56 }, - { pick_batch_size(32, n), 96, 28, 28 }, - { pick_batch_size(32, n), 192, 256, 512 }, - { pick_batch_size(32, n), 256, 28, 28 }, - { pick_batch_size(32, n), 3, 224, 224 }, - { pick_batch_size(32, n), 480, 128, 256 }, - { pick_batch_size(32, n), 528, 64, 128 }, - { pick_batch_size(770, n), 1, 8, 8 }, - { pick_batch_size(770, n), 1024, 1, 1 
}, - { pick_batch_size(152, n), 128, 80, 80 }, - { pick_batch_size(152, n), 256, 20, 20 }, - { pick_batch_size(152, n), 32, 160, 160 }, - { pick_batch_size(152, n), 512, 20, 20 }, - { pick_batch_size(152, n), 64, 160, 160 }, - { pick_batch_size(152, n), 64, 80, 80 }, - { pick_batch_size(256, n), 256, 20, 20 }, - { pick_batch_size(256, n), 512, 20, 20 } - }; - // clang-format on -} - -template -inline std::set> get_3d_bn_peract_inputs(T n = MIOPEN_TEST_DEFAULT_BATCH_SIZE_FACTOR) -{ - // clang-format off - return - { - { pick_batch_size(32, n), 1, 32, 32, 32 }, // 32x32x32 based on VoxNet arch - { pick_batch_size(32, n), 1, 14, 14, 14 }, - { pick_batch_size(32, n), 32, 14, 14, 14 }, - { pick_batch_size(32, n), 32, 12, 12, 12 }, - { pick_batch_size(32, n), 32, 6, 6, 6 }, - { pick_batch_size(256, n), 1, 32, 32, 32 }, // 32x32x32 based on VoxNet arch - { pick_batch_size(256, n), 32, 14, 14, 14 }, - { pick_batch_size(256, n), 32, 12, 12, 12 }, - { pick_batch_size(256, n), 32, 6, 6, 6 }, - { pick_batch_size(512, n), 1, 32, 32, 32 }, // 32x32x32 based on VoxNet arch - { pick_batch_size(512, n), 32, 14, 14, 14 }, - { pick_batch_size(512, n), 32, 12, 12, 12 }, - { pick_batch_size(512, n), 32, 6, 6, 6 }, - { pick_batch_size(32, n), 2, 32, 57, 125 }, // Hand-gesture recognition CVPR 2015 paper High Res Net Path - { pick_batch_size(32, n), 32, 14, 25, 59 }, - { pick_batch_size(32, n), 32, 6, 10, 27 }, - { pick_batch_size(32, n), 32, 4, 6, 11 }, - { pick_batch_size(32, n), 32, 2, 2, 3 }, - { pick_batch_size(32, n), 32, 32, 28, 62 }, // Hand-gesture recognition CVPR 2015 paper Low Res Net Path - { pick_batch_size(32, n), 32, 14, 12, 29 }, - { pick_batch_size(32, n), 32, 6, 4, 12 }, - { pick_batch_size(32, n), 32, 4, 2, 2 }, - { pick_batch_size(16, n), 32, 6, 50, 50 }, // Multi-view 3D convnet - { pick_batch_size(1, n), 3, 8, 240, 320 }, // 3D convet on video - { pick_batch_size(1, n), 3, 16, 240, 320 }, // 3D convet on video - { pick_batch_size(1, n), 3, 8, 128, 171 }, // 3D 
convet on video - { pick_batch_size(1, n), 3, 16, 128, 171 }, // 3D convet on video - { pick_batch_size(1, n), 3, 8, 112, 112 }, // 3D convet on video - { pick_batch_size(1, n), 3, 16, 112, 112 } // 3D convet on video - }; - - // clang-format on -} - -template -inline std::set> -get_3d_bn_spatial_inputs(T n = MIOPEN_TEST_DEFAULT_BATCH_SIZE_FACTOR) -{ - // clang-format off - return - { - { pick_batch_size(32, n), 1, 32, 32, 32 }, // 32x32x32 based on VoxNet arch - { pick_batch_size(32, n), 1, 14, 14, 14 }, - { pick_batch_size(32, n), 32, 14, 14, 14 }, - { pick_batch_size(32, n), 32, 12, 12, 12 }, - { pick_batch_size(32, n), 32, 6, 6, 6 }, - { pick_batch_size(256, n), 1, 32, 32, 32 }, // 32x32x32 based on VoxNet arch - { pick_batch_size(256, n), 32, 14, 14, 14 }, - { pick_batch_size(256, n), 32, 12, 12, 12 }, - { pick_batch_size(256, n), 32, 6, 6, 6 }, - { pick_batch_size(512, n), 1, 32, 32, 32 }, // 32x32x32 based on VoxNet arch - { pick_batch_size(512, n), 32, 14, 14, 14 }, - { pick_batch_size(512, n), 32, 12, 12, 12 }, - { pick_batch_size(512, n), 32, 6, 6, 6 }, - { pick_batch_size(32, n), 2, 32, 57, 125 }, // Hand-gesture recognition CVPR 2015 paper High Res Net Path - { pick_batch_size(32, n), 32, 14, 25, 59 }, - { pick_batch_size(32, n), 32, 6, 10, 27 }, - { pick_batch_size(32, n), 32, 4, 6, 11 }, - { pick_batch_size(32, n), 32, 2, 2, 3 }, - { pick_batch_size(32, n), 32, 32, 28, 62 }, // Hand-gesture recognition CVPR 2015 paper Low Res Net Path - { pick_batch_size(32, n), 32, 14, 12, 29 }, - { pick_batch_size(32, n), 32, 6, 4, 12 }, - { pick_batch_size(32, n), 32, 4, 2, 2 }, - { pick_batch_size(16, n), 32, 6, 50, 50 }, // Multi-view 3D convnet - { pick_batch_size(1, n), 3, 8, 240, 320 }, // 3D convet on video - { pick_batch_size(1, n), 3, 16, 240, 320 }, // 3D convet on video - { pick_batch_size(1, n), 3, 8, 128, 171 }, // 3D convet on video - { pick_batch_size(1, n), 3, 16, 128, 171 }, // 3D convet on video - { pick_batch_size(1, n), 3, 8, 112, 112 }, // 3D 
convet on video - { pick_batch_size(1, n), 3, 16, 112, 112 } // 3D convet on video - }; - // clang-format on -} - -template -inline std::vector> get_sub_tensor() -{ - return {{16, 4, 8, 1, 4}, - {2, 4, 8, 8, 4}, - {16, 4, 8, 4}, - {13, 8, 4, 8}, - {3, 8, 7}, - {16, 4, 10}, - {3, 8}, - {16, 4}, - {4}}; -} - -template -inline std::vector> get_tensor_offsets() -{ - static_assert(std::is_signed_v); - return {{0, 0}, {0, 2}, {4, 0}, {5, 7}}; -} - -template -inline std::vector get_tensor_offset() -{ - static_assert(std::is_signed_v); - return {0, 1, 2, 3, 4, 5}; -} - +#include #endif diff --git a/projects/miopen/test/random.hpp b/projects/miopen/test/random.hpp index 62443abb1068..7c5c0efa5962 100644 --- a/projects/miopen/test/random.hpp +++ b/projects/miopen/test/random.hpp @@ -1,62 +1,5 @@ -/******************************************************************************* - * - * MIT License - * - * Copyright (c) 2021 Advanced Micro Devices, Inc. - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - * - *******************************************************************************/ +// Forwarding header — implementation moved to miopen_utils. #ifndef GUARD_MIOPEN_TEST_RANDOM_HPP #define GUARD_MIOPEN_TEST_RANDOM_HPP - -#include "../driver/random.hpp" - -namespace prng { -template -inline T gen_descreet_uniform_sign(double scale, int32_t range) -{ - return static_cast(scale * prng::gen_A_to_B(-range + 1, range)); -} - -template -inline T gen_descreet_unsigned(double scale, int32_t range) -{ - return static_cast(scale * static_cast(gen_0_to_B(range))); -} - -} // namespace prng - -// lambda factory -template -auto uniform_signed_initializer(ScaleT scale_arg, RangeT range_arg) -{ - return [=](auto&&...) -> T { - // uniform sign give balance of both negative and positive values - return prng::gen_descreet_uniform_sign(scale_arg, range_arg); - }; -} - -template -auto uniform_unsigned_initializer(ScaleT scale_arg, RangeT range_arg) -{ - return [=](auto&&...) -> T { return prng::gen_descreet_unsigned(scale_arg, range_arg); }; -} - -#endif // GUARD_MIOPEN_TEST_RANDOM_HPP +#include +#endif diff --git a/projects/miopen/test/rnn_util.hpp b/projects/miopen/test/rnn_util.hpp index d993d0df4c57..2a25f35e61a8 100644 --- a/projects/miopen/test/rnn_util.hpp +++ b/projects/miopen/test/rnn_util.hpp @@ -1,305 +1,5 @@ -/******************************************************************************* - * - * MIT License - * - * Copyright (c) 2023 Advanced Micro Devices, Inc. 
- * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - * - *******************************************************************************/ - +// Forwarding header — implementation moved to miopen_utils. 
#ifndef MIOPEN_RNN_UTIL_H_ #define MIOPEN_RNN_UTIL_H_ - -#include -#include -#include -#include -#include -#include -#include - -#include "gemm.hpp" -#include "random.hpp" - -#include - -// complexity O(NlogN) -inline std::vector GetReverseOrderIndex(const std::vector& base_index) -{ - std::vector reverse_index(base_index.size()); - unsigned next_rev_index = 0; - for(auto id : base_index) - reverse_index[id] = next_rev_index++; - return reverse_index; -}; - -inline std::vector GetSamplesIndexDescendingOrder(const std::vector& unsorted_seq_lens) -{ - const auto sample_count = unsorted_seq_lens.size(); - - std::vector index_v(sample_count); - std::iota(index_v.begin(), index_v.end(), 0); - - auto seq_len_cmp = [&unsorted_seq_lens](unsigned a_id, unsigned b_id) { - return unsorted_seq_lens[a_id] > unsorted_seq_lens[b_id]; - }; - - std::stable_sort(index_v.begin(), index_v.end(), seq_len_cmp); - - return index_v; -} - -template -inline void HiddenTensorReorder(const std::vector& src_array, - std::vector& dst_array, - const std::vector& batch_order, - const std::vector hid_len, - bool is_dst_direct_order) -{ - const size_t copy_size = hid_len[2]; - - const size_t batch_stride = hid_len[2]; - const size_t layer_stride = batch_stride * hid_len[1]; - - for(size_t batch_id = 0; batch_id < hid_len[1]; batch_id++) - { - const auto src_batch_off = - batch_stride * (is_dst_direct_order ? batch_order[batch_id] : batch_id); - const auto dst_batch_off = - batch_stride * (is_dst_direct_order ? 
batch_id : batch_order[batch_id]); - - for(size_t layer_id = 0; layer_id < hid_len[0]; layer_id++) - { - const auto dst_offset = dst_batch_off + layer_id * layer_stride; - const auto src_offset = src_batch_off + layer_id * layer_stride; - - std::copy(src_array.begin() + src_offset, - src_array.begin() + src_offset + copy_size, - dst_array.begin() + dst_offset); - } - } -} - -inline void createTensorDescArray(std::vector& td, - std::vector& ptd, - const std::vector bs, - const int secondDim, - miopenDataType_t dataType) -{ - - std::transform(bs.begin(), bs.end(), std::back_inserter(td), [&](int x) { - return miopen::TensorDescriptor( - dataType, {static_cast(x), static_cast(secondDim)}); - }); - std::transform(td.begin(), td.end(), std::back_inserter(ptd), [](miopen::TensorDescriptor& x) { - return &x; - }); -} - -inline std::tuple -GetTempPackedBuffersSize(std::vector batchs, int in_vec, int out_vec) -{ - size_t total_batch = std::accumulate(batchs.begin(), batchs.end(), 0ULL); - - size_t in_buff_size = total_batch * in_vec; - size_t out_buff_size = total_batch * out_vec; - return {in_buff_size, out_buff_size}; -} - -inline size_t getSuperTensorSize(const std::vector& bs, - int seqLength, - int inputSize, - int hiddenSize, - int maxPaddingVal, - bool isBidirect, - bool isInput, - bool isPadded) -{ - return (isPadded // - ? static_cast(seqLength) * maxPaddingVal - : std::accumulate(bs.begin(), bs.end(), 0ULL)) // - * (isInput // - ? static_cast(inputSize) - : static_cast(hiddenSize) * (isBidirect ? 
2 : 1)); -} - -template -void ChangeDataPadding(const std::vector& src_array, - std::vector& dst_array, - const std::vector& batch_list, - int max_batch, - int sample_size, - bool is_src_packed) -{ - auto seq_len = batch_list.size(); - - auto scr_ptr = &src_array[0]; - auto dst_ptr = &dst_array[0]; - - for(int seq_id = 0; seq_id < seq_len; seq_id++) - { - auto packed_size = batch_list[seq_id] * sample_size; - - std::copy(scr_ptr, scr_ptr + packed_size, dst_ptr); - - if(is_src_packed) - { - dst_ptr += max_batch * sample_size; - scr_ptr += packed_size; - } - else - { - scr_ptr += max_batch * sample_size; - dst_ptr += packed_size; - } - } -} - -// RNN VANILLA configs -inline std::vector get_rnn_num_layers() { return {{1, 3}}; } - -inline std::vector get_rnn_batchSize() { return {{1, 17}}; } - -inline std::vector get_rnn_seq_len() { return {{1, 3, 51}}; } - -inline std::vector get_rnn_vector_len() { return {31}; } - -inline std::vector get_rnn_hidden_size() { return {127}; } - -// LSTM configs -inline std::vector get_lstm_num_layers() { return {{1, 3}}; } - -inline std::vector get_lstm_batchSize() { return {{1, 17}}; } - -inline std::vector get_lstm_seq_len() { return {{1, 25}}; } - -inline std::vector get_lstm_vector_len() { return {17}; } - -inline std::vector get_lstm_hidden_size() { return {67}; } - -// GRU configs -inline std::vector get_gru_num_layers() { return {{1, 3}}; } - -inline std::vector get_gru_batchSize() { return {{1, 17}}; } - -inline std::vector get_gru_seq_len() { return {{1, 23}}; } - -inline std::vector get_gru_vector_len() { return {13}; } - -inline std::vector get_gru_hidden_size() { return {67}; } - -inline std::vector> generate_batchSeq(const int batchSize, const int seqLength) -{ - - static constexpr int modval = 3; - - int currentval = batchSize; - std::vector batchSeq; - batchSeq.reserve(seqLength); - for(int i = 0; i < seqLength; i++) - { - if(i > 0) - { - int nvalue = currentval - prng::gen_0_to_B(modval); - currentval = (nvalue < 1) ? 
1 : nvalue; - // printf("current value: %d\n", currentval); - } - // printf("adding a value to batch sequence: %d\n", currentval); - batchSeq.push_back(currentval); - } - return {batchSeq}; -} - -inline int sumvc(const std::vector& x) { return std::accumulate(x.begin(), x.end(), 0); } - -template -inline T activfunc(T x, int actvf) -{ - T alpha = static_cast(1), beta0 = static_cast(0), beta1 = static_cast(1); - if(actvf == 0) - { - return (x > 0) ? x : x * beta0; - } - else if(actvf == 2) - { - return static_cast(1 / (1 + std::exp(-x))); - } - return static_cast(alpha * std::tanh(beta1 * x)); -} - -template -inline T dervactivfunc(T x, int actvf) -{ - if(actvf == 0) - { - return static_cast(x > 0 ? 1 : 0); - } - else if(actvf == 2) - { - return static_cast(std::exp(-x) / (1 + std::exp(-x)) / (1 + std::exp(-x))); - } - - return static_cast(1 / std::cosh(x) / std::cosh(x)); -} - -template -void RNN_mm_cpu_batched(const Dtype* a_ptr, - size_t a_cols, - size_t a_rows, - size_t lda, - size_t a_stride, - int a_flags, - const Dtype* b_ptr, - size_t b_cols, - size_t b_rows, - size_t ldb, - size_t b_stride, - int b_flags, - Dtype* c_ptr, - size_t c_cols, - size_t c_rows, - size_t ldc, - size_t c_stride, - int batchCount, - double alpha, - double beta) -{ - for(int i = 0; i < batchCount; ++i) - { - gemm_cpu(a_ptr + a_stride * i, - a_cols, - a_rows, - lda, - a_flags == 1 ? true : false, - b_ptr + b_stride * i, - b_cols, - b_rows, - ldb, - b_flags == 1 ? true : false, - c_ptr + c_stride * i, - c_cols, - c_rows, - ldc, - alpha, - beta); - } -} - +#include #endif diff --git a/projects/miopen/test/serialize.hpp b/projects/miopen/test/serialize.hpp index 6b9b1b29632e..b9e948307a1e 100644 --- a/projects/miopen/test/serialize.hpp +++ b/projects/miopen/test/serialize.hpp @@ -1,129 +1,5 @@ -/******************************************************************************* - * - * MIT License - * - * Copyright (c) 2018 Advanced Micro Devices, Inc. 
- * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - * - *******************************************************************************/ - +// Forwarding header — implementation moved to miopen_utils. 
#ifndef MIOPEN_GUARD_TEST_SERIALIZE_HPP #define MIOPEN_GUARD_TEST_SERIALIZE_HPP - -#include -#include -#include -#include -#include -#include -#include - -template -struct is_trivial_serializable : std::is_trivially_copy_constructible -{ -}; - -template <> -struct is_trivial_serializable : std::true_type -{ -}; - -template -std::enable_if_t{}> serialize(std::ostream& os, const T& x) -{ - os.write(reinterpret_cast(&x), sizeof(T)); -} - -template -auto serialize(std::ostream& os, - const T& x) -> decltype(x.begin(), x.end(), T(x.begin(), x.end()), void()) -{ - std::size_t n = std::distance(x.begin(), x.end()); - serialize(os, n); - for(auto&& y : x) - serialize(os, y); -} - -template -std::enable_if_t>{}> -serialize(std::ostream& os, const std::tuple& t) -{ - miopen::unpack( - [&](auto&&... xs) { miopen::each_args([&](auto&& x) { serialize(os, x); }, xs...); }, t); -} - -template -std::enable_if_t{}> serialize(std::istream& is, T& x) -{ - is.read(reinterpret_cast(&x), sizeof(T)); -} - -template -std::enable_if_t{}> serialize(std::istream& is, std::vector& x) -{ - std::size_t n; - serialize(is, n); - x.resize(n); - is.read(reinterpret_cast(x.data()), sizeof(T) * n); -} - -template -auto serialize(std::istream& is, - T& x) -> decltype(x.begin(), x.end(), x.assign(x.begin(), x.end()), void()) -{ - using value_type = std::decay_t; - std::size_t n; - serialize(is, n); - std::vector v; - v.reserve(n); - for(std::size_t i = 0; i < n; i++) - { - value_type y; - serialize(is, y); - v.push_back(y); - } - x.assign(v.begin(), v.end()); -} - -template -std::enable_if_t>{}> -serialize(std::istream& is, - // cppcheck-suppress constParameter - std::tuple& t) -{ - miopen::unpack( - [&](auto&&... 
xs) { miopen::each_args([&](auto&& x) { serialize(is, x); }, xs...); }, t); -} - -template -void load(std::string name, T& x) -{ - std::ifstream is{name.c_str()}; - serialize(is, x); -} - -template -void save(std::string name, const T& x) -{ - std::ofstream os{name.c_str()}; - serialize(os, x); -} - +#include #endif diff --git a/projects/miopen/test/tensor_holder.hpp b/projects/miopen/test/tensor_holder.hpp index 64be2aa7c851..5f075eb9b528 100644 --- a/projects/miopen/test/tensor_holder.hpp +++ b/projects/miopen/test/tensor_holder.hpp @@ -1,505 +1,5 @@ -/******************************************************************************* - * - * MIT License - * - * Copyright (c) 2017 Advanced Micro Devices, Inc. - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - * - *******************************************************************************/ +// Forwarding header — implementation moved to miopen_utils. 
#ifndef GUARD_TENSOR_HOLDER_HPP #define GUARD_TENSOR_HOLDER_HPP - -#include "network_data.hpp" -#include -#include -#include -#include -#include -#include -#include "../driver/random.hpp" - -#include "serialize.hpp" - -#include -using half = half_float::half; -using hip_bfloat16 = bfloat16; -#include "../../src/kernels/hip_float8.hpp" -using float8_fnuz = miopen_f8::hip_f8; -using bfloat8_fnuz = miopen_f8::hip_f8; - -#include -#include - -template -void visit_tensor_size(std::size_t n, F f) -{ - switch(n) - { - case 0: { - f(std::integral_constant{}); - break; - } - case 1: { - f(std::integral_constant{}); - break; - } - case 2: { - f(std::integral_constant{}); - break; - } - case 3: { - f(std::integral_constant{}); - break; - } - case 4: { - f(std::integral_constant{}); - break; - } - case 5: { - f(std::integral_constant{}); - break; - } - default: throw std::runtime_error("Unknown tensor size"); - } -} - -template -struct miopen_type; - -template <> -struct miopen_type : std::integral_constant -{ -}; - -template <> -struct miopen_type : std::integral_constant -{ -}; - -template <> -struct miopen_type : std::integral_constant -{ -}; -template <> -struct miopen_type : std::integral_constant -{ -}; - -template <> -struct miopen_type : std::integral_constant -{ -}; - -template <> -struct miopen_type : std::integral_constant -{ -}; - -template <> -struct miopen_type : std::integral_constant -{ -}; - -template <> -struct miopen_type : std::integral_constant -{ -}; - -template <> -struct miopen_type : std::integral_constant -{ -}; - -template <> -struct miopen_type : std::integral_constant -{ -}; - -template <> -struct miopen_type : std::integral_constant -{ -}; - -template <> -struct miopen_type : std::integral_constant -{ -}; - -template -struct tensor -{ - using value_type = T; - miopen::TensorDescriptor desc; - std::vector data; - -#if defined(__clang__) || defined(__GNUG__) -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wdeprecated-declarations" 
-#endif - - tensor() : desc(miopen_type{}) {} - -#if defined(__clang__) || defined(__GNUG__) -#pragma GCC diagnostic pop -#endif - - template - tensor(const std::vector& dims) : desc(miopen_type{}, dims), data(desc.GetElementSpace()) - { - } - - template - tensor(const std::vector& dims, const std::vector& strides) - : desc(miopen_type{}, dims, strides), data(desc.GetElementSpace()) - { - assert(dims.size() == strides.size()); - } - - template - tensor(miopenTensorLayout_t layout, const std::vector& dims) - : desc(miopen_type{}, layout, dims), data(desc.GetElementSpace()) - { - } - - template - tensor(miopenTensorLayout_t layout, const std::vector& dims, const std::vector& strides) - : desc(miopen_type{}, layout, dims, strides), data(desc.GetElementSpace()) - { - assert(dims.size() == strides.size()); - } - - tensor(std::size_t n, std::size_t c, std::size_t h, std::size_t w) - : desc(miopen_type{}, {n, c, h, w}), data(n * c * h * w) - { - } - - tensor(miopenTensorLayout_t layout, std::size_t n, std::size_t c, std::size_t h, std::size_t w) - : desc(miopen_type{}, layout, {n, c, h, w}), data(desc.GetElementSpace()) - { - } - - tensor(std::size_t n, std::size_t c, std::size_t d, std::size_t h, std::size_t w) - : desc(miopen_type{}, {n, c, d, h, w}), data(n * c * d * h * w) - { - } - - tensor(std::size_t n) : desc(miopen_type{}, {n}), data(n) {} - - tensor(miopen::TensorDescriptor rhs) : desc(std::move(rhs)) - { - assert(desc.GetType() == miopen_type{} - /// In the driver, T is input tensor type, but output tensor holders - /// are instantiatied with T as well. This leads to false assertion - /// failures when T is INT8 because output type is different. 
- /// \todo Get rid of this hack when the driver is improved: - || (miopen_type{} == miopenInt8 && desc.GetType() == miopenInt32)); - data.resize(desc.GetElementSpace()); - } - - size_t GetDataByteSize() const { return GetSize() * sizeof(T); } - - size_t GetSize() const { return desc.GetElementSpace(); } - - template - tensor& generate(G g) & - { - if(this->desc.GetVectorLength() > 1) - this->generate_vect_impl(g); - else - this->generate_impl(g); - return *this; - } - - template - tensor&& generate(G g) && - { - if(this->desc.GetVectorLength() > 1) - this->generate_vect_impl(g); - else - this->generate_impl(g); - return std::move(*this); - } - - template - void generate_impl(G g) - { - auto seed = std::accumulate(desc.GetLengths().begin(), - desc.GetLengths().end(), - std::size_t{521288629}, - [](auto x, auto y) { - x ^= x << 1U; - return x ^ y; - }); - seed ^= data.size(); - seed ^= desc.GetLengths().size(); - prng::reset_seed(seed); - auto iterator = data.begin(); - auto assign = [&](T x) { - *iterator = x; - ++iterator; - }; - this->for_each( - miopen::compose(miopen::compose(assign, miopen::cast_to()), std::move(g))); - } - - template - void generate_vect_impl(G g) - { - auto seed = std::accumulate(desc.GetLengths().begin(), - desc.GetLengths().end(), - std::size_t{521288629}, - [](auto x, auto y) { - x ^= x << 1U; - return x ^ y; - }); - seed ^= data.size(); - seed ^= desc.GetLengths().size(); - prng::reset_seed(seed); - auto iterator = data.begin(); - auto vectorLength = desc.GetVectorLength(); - auto assign = [&](T x) { - assert(iterator < data.end()); - // for debugging - for(auto i = 0; i < vectorLength; i++) - { - *(iterator + i) = x; - } - iterator += vectorLength; - }; - this->for_each( - miopen::compose(miopen::compose(assign, miopen::cast_to()), std::move(g))); - } - - template - struct for_each_unpacked - { - Loop loop; - F f; - template - auto operator()(Ts... 
xs) const -> decltype(f(xs...), void()) - { - loop(xs...)(std::move(f)); - } - - struct any - { - any() {} - template - any(X) - { - } - }; - - [[noreturn]] void operator()(any = {}, - any = {}, - any = {}, - any = {}, - any = {}, - any = {}, - any = {}, - any = {}, - any = {}) const - { - throw std::runtime_error( - "Arguments to for_each do not match tensor size or the function " + - miopen::get_type_name() + " can not be called."); - } - }; - - struct for_each_handler - { - template - void operator()(Self* self, Loop loop, F f, Size size) const - { - auto dims = miopen::tien(self->desc.GetLengths()); - miopen::unpack(for_each_unpacked{loop, std::move(f)}, dims); - } - }; - - template - void for_each(F f) const - { - visit_tensor_size( - desc.GetLengths().size(), - std::bind(for_each_handler{}, this, miopen::ford, std::move(f), std::placeholders::_1)); - } - - template - void par_for_each(F f) const - { - visit_tensor_size( - desc.GetLengths().size(), - std::bind( - for_each_handler{}, this, miopen::par_ford, std::move(f), std::placeholders::_1)); - } - - template - T& operator()(Ts... xs) - { - assert(this->desc.GetIndex(xs...) < data.size()); - return this->data[this->desc.GetIndex(xs...)]; - } - - template - const T& operator()(Ts... xs) const - { - assert(this->desc.GetIndex(xs...) < data.size()); - return this->data[this->desc.GetIndex(xs...)]; - } - - template - const T& operator()(const std::array& multi_id) const - { - auto f = [&](auto... 
is) { return this->desc.GetIndex(is...); }; - assert(miopen::unpack(f, multi_id) < data.size()); - return this->data[miopen::unpack(f, multi_id)]; - } - - T& operator[](std::size_t i) { return data.at(i); } - - const T& operator[](std::size_t i) const { return data.at(i); } - - typename std::vector::iterator begin() { return data.begin(); } - - typename std::vector::iterator end() { return data.end(); } - - typename std::vector::const_iterator begin() const { return data.begin(); } - - typename std::vector::const_iterator end() const { return data.end(); } - - friend std::ostream& operator<<(std::ostream& stream, const tensor& t) - { - return stream << t.desc; - } - - template - void dump_inner(size_t dim, std::array& coord, Stream& stream) const - { - const auto lengths = this->desc.GetLengths(); - if(lengths.size() == 0) - { - // 0D special case: Just print the one value that we have and return. - stream << (*this)(coord); - } - else if(dim + 1 == lengths.size()) - { - // 1D special case: dump everything on one line - for(size_t i = 0; i < lengths[dim]; ++i) - { - if(i != 0) - stream << ' '; - - coord[dim] = i; - stream << std::setw(4) << (*this)(coord); - } - - stream << '\n'; - } - else - { - if(dim + 2 == lengths.size()) - { - // 2D special case: Also print which 2D slice we are currently printing - // Note: this is not needed for higher dimensions, as they will also pass - // through this branch. 
- stream << "slice ["; - for(size_t i = 0; i < dim; ++i) - { - stream << coord[i] << ", "; - } - stream << ":, :]\n"; - } - - for(size_t i = 0; i < lengths[dim]; ++i) - { - coord[dim] = i; - this->dump_inner(dim + 1, coord, stream); - } - } - } - - template - void dump(const char* name, Stream& stream = std::cout) const - { - const auto n = this->desc.GetLengths().size(); - stream << "==== " << name << ": " << *this << n << '\n'; - stream.fill(' '); - - const auto flags = stream.flags(); - - visit_tensor_size(n, [&](const auto size) { - constexpr size_t N = decltype(size)::value; - std::array coord; - this->dump_inner(0, coord, stream); - }); - - stream.flags(flags); - } -}; - -template -void serialize(std::istream& s, tensor& x) -{ - std::vector lens; - serialize(s, lens); - std::vector strides; - serialize(s, strides); - x.desc = miopen::TensorDescriptor{miopen_type{}, lens, strides}; - serialize(s, x.data); -} - -template -void serialize(std::ostream& s, const tensor& x) -{ - const auto& lens = x.desc.GetLengths(); - const auto& strides = x.desc.GetStrides(); - serialize(s, lens); - serialize(s, strides); - serialize(s, x.data); -} - -struct tensor_generate -{ - template - Tensor&& operator()(Tensor&& t, G g) const - { - return std::forward(t.generate(g)); - } -}; - -struct tensor_elem_gen_integer -{ - uint64_t max_value = 17; - - template - double operator()(Ts... 
Xs) const - { - static_assert(sizeof...(Ts) < 6, - "Dimensions in tensor_elem_gen_integer must be less than 6."); - assert(max_value > 0); - std::array left = {{Xs...}}; - std::array right = {{613, 547, 701, 877, 1049}}; - uint64_t dot = - std::inner_product(left.begin(), left.end(), right.begin(), static_cast(173)); - return static_cast(dot % max_value); - } -}; - +#include #endif diff --git a/projects/miopen/test/verify.hpp b/projects/miopen/test/verify.hpp index 1d7d9cf80a50..2bf12f1057a3 100644 --- a/projects/miopen/test/verify.hpp +++ b/projects/miopen/test/verify.hpp @@ -1,245 +1,5 @@ -/******************************************************************************* - * - * MIT License - * - * Copyright (c) 2017 Advanced Micro Devices, Inc. - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - * - *******************************************************************************/ +// Forwarding header — implementation moved to miopen_utils. 
#ifndef GUARD_VERIFY_HPP #define GUARD_VERIFY_HPP - -#include -#include -#include -#include -#include -#include -#include -#include -using half = half_float::half; -using hip_bfloat16 = bfloat16; -#include -#include "tensor_holder.hpp" - -namespace miopen { - -// Compute the value of a range -template -using range_value = typename std::decay().begin())>::type; - -struct sum_fn -{ - template - auto operator()(T x, U y) const MIOPEN_RETURNS(x + y); -}; -static constexpr sum_fn sum{}; - -struct max_fn -{ - template - static T id(T x) - { - return x; - } - - template - auto operator()(T x, U y) const MIOPEN_RETURNS(max_fn::id(x > y ? x : y)); -}; -static constexpr max_fn max{}; - -namespace abs_diff_detail { -using std::fabs; -struct fn -{ - template - auto operator()(T x, U y) const MIOPEN_RETURNS(fabs(x - y)); -}; - -} // namespace abs_diff_detail - -static constexpr abs_diff_detail::fn abs_diff{}; - -struct not_finite_fn -{ - template ), bool>::type = false> - bool operator()(T x) const - { - return !std::isfinite(x); - } - - template ::type, half_float::half>), - bool>::type = false> - bool operator()(T x) const - { - return !half_float::isfinite(x); - } - - template ::type, bfloat16>), - bool>::type = false> - bool operator()(T x) const - { - return !std::isfinite(x); // bfloat16 has float() conversion operator - } - - template ), bool>::type = false> - bool operator()(T x) const - { - std::ignore = x; - return false; - } -}; -static constexpr not_finite_fn not_finite{}; - -template -T as(T, U x) -{ - return x; -} - -struct compare_mag_fn -{ - template - bool operator()(T x, U y) const - { - using std::fabs; - return fabs(x) < fabs(y); - } -}; -static constexpr compare_mag_fn compare_mag{}; - -struct square_diff_fn -{ - template - double operator()(T x, U y) const - { - double diff = static_cast(x - y); - return diff * diff; - } -}; -static constexpr square_diff_fn square_diff{}; - -template , bool> = true> -bool equal_values(T const& lhs, T const& rhs) -{ - 
return lhs == rhs; -} - -template , bool> = true> -bool equal_values(T const& lhs, T const& rhs) -{ - return miopen::float_equal_sentinel(lhs, rhs); -} - -template -bool range_empty(R1&& r1) -{ - return r1.begin() == r1.end(); -} - -template -auto range_distance(R1&& r1) MIOPEN_RETURNS(std::distance(r1.begin(), r1.end())); - -template -bool range_zero(const std::vector& r) -{ - return std::all_of(r.begin(), r.end(), [](T x) { return equal_values(x, T()); }); -} - -template -bool range_zero(const tensor& r) -{ - return range_zero(r.data); -} - -template -T range_product(R1&& r1, R2&& r2, T state, Reducer r, Product p) -{ - return std::inner_product(r1.begin(), r1.end(), r2.begin(), state, r, p); -} - -template -std::size_t mismatch_idx(R1&& r1, R2&& r2, Compare compare) -{ - auto p = std::mismatch(r1.begin(), r1.end(), r2.begin(), compare); - return std::distance(r1.begin(), p.first); -} - -template -int64_t find_idx(R1&& r1, Predicate p) -{ - auto it = std::find_if(r1.begin(), r1.end(), p); - if(it == r1.end()) - return -1; - else - return std::distance(r1.begin(), it); -} - -template -double max_diff(R1&& r1, R2&& r2) -{ - return range_product(r1, r2, 0.0, max, abs_diff); -} - -template -auto max_diff_v2(R1&& r1, R2&& r2) -{ - using T = decltype(r1[0] - r2[0]); - auto abs_diff_func = [](auto x, auto y) { return x > y ? 
x - y : y - x; }; - // BUG: deduced wrong datatype, half_float bug - if constexpr(std::is_same_v) - return range_product(r1, r2, half_float::half(), max, abs_diff_func); - else - return range_product(r1, r2, T(), max, abs_diff_func); -} - -template -std::size_t mismatch_diff(R1&& r1, R2&& r2, T diff) -{ - return mismatch_idx( - r1, - r2, - std::bind( - float_equal, diff, std::bind(abs_diff, std::placeholders::_1, std::placeholders::_2))); -} - -template -double rms_range(R1&& r1, R2&& r2) -{ - std::size_t n = range_distance(r1); - if(n == range_distance(r2)) - { - if(n == 0) - return 0; - double square_difference = range_product(r1, r2, 0.0, sum_fn{}, square_diff); - double mag1 = static_cast(*std::max_element(r1.begin(), r1.end(), compare_mag)); - double mag2 = static_cast(*std::max_element(r2.begin(), r2.end(), compare_mag)); - double mag = - std::max({std::fabs(mag1), std::fabs(mag2), std::numeric_limits::min()}); - return std::sqrt(square_difference) / (std::sqrt(n) * mag); - } - else - return double(std::numeric_limits>::max()); -} -} // namespace miopen +#include #endif From 9d8ed0aed7ef456e693e407400c057d395d9aa4b Mon Sep 17 00:00:00 2001 From: Brad Pepers Date: Sat, 9 May 2026 07:40:07 -0600 Subject: [PATCH 04/11] Remove unnecessary test includes of driver headers MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Remove 15 unused #include directives where test files included driver headers without using any symbols from them: - 14 files included driver/tensor_driver.hpp unnecessarily - 1 file included driver/conv_common.hpp unnecessarily Remaining test→driver cross-references (3 files, all legitimate): - softmax_find20.cpp → mloSoftmaxHost.hpp (CPU reference, move later) - find_mode_trust_verify.cpp → driver.hpp (uses GPUMem) - kernel_tuning_net.cpp → driver.hpp (uses GPUMem) Co-Authored-By: Claude Sonnet 4 --- projects/miopen/test/gtest/adam.hpp | 1 - projects/miopen/test/gtest/addlayernorm.hpp | 1 - 
projects/miopen/test/gtest/cat.hpp | 1 - projects/miopen/test/gtest/conv3d_test_case.hpp | 1 - projects/miopen/test/gtest/getitem.hpp | 1 - projects/miopen/test/gtest/group_conv.hpp | 1 - projects/miopen/test/gtest/group_conv_deterministic_split_k.cpp | 1 - projects/miopen/test/gtest/groupnorm.hpp | 1 - projects/miopen/test/gtest/kthvalue.hpp | 1 - projects/miopen/test/gtest/layout_transpose.cpp | 1 - projects/miopen/test/gtest/reducecalculation.hpp | 1 - projects/miopen/test/gtest/reduceextreme.hpp | 1 - projects/miopen/test/gtest/rope.hpp | 1 - projects/miopen/test/gtest/t5layernorm.hpp | 1 - projects/miopen/test/gtest/transformers_adam_w.hpp | 1 - 15 files changed, 15 deletions(-) diff --git a/projects/miopen/test/gtest/adam.hpp b/projects/miopen/test/gtest/adam.hpp index 0efd9b390765..e54ddd1fc85d 100644 --- a/projects/miopen/test/gtest/adam.hpp +++ b/projects/miopen/test/gtest/adam.hpp @@ -24,7 +24,6 @@ * *******************************************************************************/ #define MIOPEN_BETA_API 1 -#include "../driver/tensor_driver.hpp" #include "cpu_adam.hpp" #include "get_handle.hpp" #include "random.hpp" diff --git a/projects/miopen/test/gtest/addlayernorm.hpp b/projects/miopen/test/gtest/addlayernorm.hpp index 0eba1588058d..511882710ff8 100644 --- a/projects/miopen/test/gtest/addlayernorm.hpp +++ b/projects/miopen/test/gtest/addlayernorm.hpp @@ -24,7 +24,6 @@ * *******************************************************************************/ -#include "../driver/tensor_driver.hpp" #include "get_handle.hpp" #include "random.hpp" #include "tensor_holder.hpp" diff --git a/projects/miopen/test/gtest/cat.hpp b/projects/miopen/test/gtest/cat.hpp index 8d5fb109e0ea..bf29ccc7bcb0 100644 --- a/projects/miopen/test/gtest/cat.hpp +++ b/projects/miopen/test/gtest/cat.hpp @@ -2,7 +2,6 @@ // SPDX-License-Identifier: MIT #define MIOPEN_BETA_API 1 -#include "../driver/tensor_driver.hpp" #include "cpu_cat.hpp" #include "get_handle.hpp" #include "random.hpp" 
diff --git a/projects/miopen/test/gtest/conv3d_test_case.hpp b/projects/miopen/test/gtest/conv3d_test_case.hpp index a10c1809cacf..d9a061941703 100644 --- a/projects/miopen/test/gtest/conv3d_test_case.hpp +++ b/projects/miopen/test/gtest/conv3d_test_case.hpp @@ -30,7 +30,6 @@ #include "get_handle.hpp" #include -#include "../driver/tensor_driver.hpp" #include "conv_common.hpp" #include "conv_test_base.hpp" diff --git a/projects/miopen/test/gtest/getitem.hpp b/projects/miopen/test/gtest/getitem.hpp index 22c98ca67b99..8889b1d3d457 100644 --- a/projects/miopen/test/gtest/getitem.hpp +++ b/projects/miopen/test/gtest/getitem.hpp @@ -24,7 +24,6 @@ * *******************************************************************************/ -#include "../driver/tensor_driver.hpp" #include "get_handle.hpp" #include "random.hpp" #include "tensor_holder.hpp" diff --git a/projects/miopen/test/gtest/group_conv.hpp b/projects/miopen/test/gtest/group_conv.hpp index d9ab9e080898..8acdd56548e2 100644 --- a/projects/miopen/test/gtest/group_conv.hpp +++ b/projects/miopen/test/gtest/group_conv.hpp @@ -32,7 +32,6 @@ #include #include -#include "../driver/tensor_driver.hpp" #include "conv_common.hpp" #include "gtest_common.hpp" diff --git a/projects/miopen/test/gtest/group_conv_deterministic_split_k.cpp b/projects/miopen/test/gtest/group_conv_deterministic_split_k.cpp index 3e141b72057e..7f9c62901733 100644 --- a/projects/miopen/test/gtest/group_conv_deterministic_split_k.cpp +++ b/projects/miopen/test/gtest/group_conv_deterministic_split_k.cpp @@ -30,7 +30,6 @@ #include #include "../random.hpp" #include "get_handle.hpp" -#include "../driver/tensor_driver.hpp" #include "conv_common.hpp" #include "gtest_common.hpp" diff --git a/projects/miopen/test/gtest/groupnorm.hpp b/projects/miopen/test/gtest/groupnorm.hpp index 33c4ed105f59..e28c5b652605 100644 --- a/projects/miopen/test/gtest/groupnorm.hpp +++ b/projects/miopen/test/gtest/groupnorm.hpp @@ -31,7 +31,6 @@ #include "cpu_groupnorm.hpp" #include 
"get_handle.hpp" #include "random.hpp" -#include "../driver/tensor_driver.hpp" #include "verify.hpp" #include diff --git a/projects/miopen/test/gtest/kthvalue.hpp b/projects/miopen/test/gtest/kthvalue.hpp index 2aa7e6fd41d1..58d7db388419 100644 --- a/projects/miopen/test/gtest/kthvalue.hpp +++ b/projects/miopen/test/gtest/kthvalue.hpp @@ -23,7 +23,6 @@ * SOFTWARE. * *******************************************************************************/ -#include "../driver/tensor_driver.hpp" #include "cpu_kthvalue.hpp" #include "get_handle.hpp" diff --git a/projects/miopen/test/gtest/layout_transpose.cpp b/projects/miopen/test/gtest/layout_transpose.cpp index f67c7a0387de..b4c86a99846a 100644 --- a/projects/miopen/test/gtest/layout_transpose.cpp +++ b/projects/miopen/test/gtest/layout_transpose.cpp @@ -25,7 +25,6 @@ *******************************************************************************/ #include -#include "../../driver/conv_common.hpp" #include #include #include diff --git a/projects/miopen/test/gtest/reducecalculation.hpp b/projects/miopen/test/gtest/reducecalculation.hpp index 2f2867423d5f..94b70ac8a1ea 100644 --- a/projects/miopen/test/gtest/reducecalculation.hpp +++ b/projects/miopen/test/gtest/reducecalculation.hpp @@ -24,7 +24,6 @@ * *******************************************************************************/ -#include "../driver/tensor_driver.hpp" #include "../src/kernels/MIOpenReduceCalculation.hpp" #include "get_handle.hpp" #include "random.hpp" diff --git a/projects/miopen/test/gtest/reduceextreme.hpp b/projects/miopen/test/gtest/reduceextreme.hpp index f884bb8fc5cf..4d2658a39569 100644 --- a/projects/miopen/test/gtest/reduceextreme.hpp +++ b/projects/miopen/test/gtest/reduceextreme.hpp @@ -24,7 +24,6 @@ * *******************************************************************************/ -#include "../driver/tensor_driver.hpp" #include "../src/kernels/MIOpenReduceExtreme.hpp" #include "get_handle.hpp" #include "random.hpp" diff --git 
a/projects/miopen/test/gtest/rope.hpp b/projects/miopen/test/gtest/rope.hpp index 8c8dd2ed2b3d..109ff0549978 100644 --- a/projects/miopen/test/gtest/rope.hpp +++ b/projects/miopen/test/gtest/rope.hpp @@ -24,7 +24,6 @@ * *******************************************************************************/ -#include "../driver/tensor_driver.hpp" #include "get_handle.hpp" #include "random.hpp" #include "tensor_holder.hpp" diff --git a/projects/miopen/test/gtest/t5layernorm.hpp b/projects/miopen/test/gtest/t5layernorm.hpp index 1ee2f2bd6ebe..e71819273683 100644 --- a/projects/miopen/test/gtest/t5layernorm.hpp +++ b/projects/miopen/test/gtest/t5layernorm.hpp @@ -24,7 +24,6 @@ * *******************************************************************************/ -#include "../driver/tensor_driver.hpp" #include "get_handle.hpp" #include "random.hpp" #include "tensor_holder.hpp" diff --git a/projects/miopen/test/gtest/transformers_adam_w.hpp b/projects/miopen/test/gtest/transformers_adam_w.hpp index d2a804841258..ef465fc98854 100644 --- a/projects/miopen/test/gtest/transformers_adam_w.hpp +++ b/projects/miopen/test/gtest/transformers_adam_w.hpp @@ -24,7 +24,6 @@ * *******************************************************************************/ #define MIOPEN_BETA_API 1 -#include "../driver/tensor_driver.hpp" #include "cpu_transformers_adam_w.hpp" #include "get_handle.hpp" #include "random.hpp" From ff45eddb953db509165863cae4e8179e28bcdedd Mon Sep 17 00:00:00 2001 From: Brad Pepers Date: Sat, 9 May 2026 08:07:18 -0600 Subject: [PATCH 05/11] =?UTF-8?q?Eliminate=20remaining=20test=E2=86=92driv?= =?UTF-8?q?er=20cross-includes?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Move mloSoftmaxHost.hpp (CPU softmax reference) to miopen_utils. Create gpu_mem.hpp forwarding header in miopen_utils for GPUMem (temporary Phase 1 shim; GPUMem extraction is Phase 2). Update 3 test files to include through miopen_utils instead of directly from driver/. 
Result: zero cross-includes between driver/ and test/ in either direction. The only remaining Phase 2 cleanup items are: - miopen_utils/gpu_mem.hpp → driver/driver.hpp (extract GPUMem) - common_utils/random.hpp → miopen/env.hpp (env var dependency) Co-Authored-By: Claude Sonnet 4 --- projects/miopen/driver/mloSoftmaxHost.hpp | 349 +---------------- .../include/miopen_utils/gpu_mem.hpp | 12 + .../include/miopen_utils/mloSoftmaxHost.hpp | 350 ++++++++++++++++++ .../test/gtest/find_mode_trust_verify.cpp | 2 +- .../miopen/test/gtest/kernel_tuning_net.cpp | 2 +- projects/miopen/test/gtest/softmax_find20.cpp | 2 +- 6 files changed, 367 insertions(+), 350 deletions(-) create mode 100644 projects/miopen/miopen_utils/include/miopen_utils/gpu_mem.hpp create mode 100644 projects/miopen/miopen_utils/include/miopen_utils/mloSoftmaxHost.hpp diff --git a/projects/miopen/driver/mloSoftmaxHost.hpp b/projects/miopen/driver/mloSoftmaxHost.hpp index fd0a1768e6a6..e0fec924c5b9 100644 --- a/projects/miopen/driver/mloSoftmaxHost.hpp +++ b/projects/miopen/driver/mloSoftmaxHost.hpp @@ -1,350 +1,5 @@ -// Copyright (c) Advanced Micro Devices, Inc., or its affiliates. -// SPDX-License-Identifier: MIT - +// Forwarding header — implementation moved to miopen_utils. #ifndef MLO_SOFTMAXHOST_H_ #define MLO_SOFTMAXHOST_H_ - -#include -#include - -//////////////////////////////////////////////////////////// -// -/////////////////////////////////////////////////////////// - -#define NEGATIVE_INF_FP32 (-1e20) -#define NEGATIVE_INF_FP16 (-1e5) - -template -T logaddexp(T x, T y, T neg_inf) -{ - T a = std::max(x, y); - T b = std::min(x, y); - T c = b - a; - - return c <= neg_inf ? 
std::max(a, neg_inf) : std::max(T(a + log(T(1) + exp(b - a))), neg_inf); -} - -template -int mloSoftmaxForwardRunHost(miopenTensorDescriptor_t inputTensor, - miopenTensorDescriptor_t outputTensor, - Tgpu* in, - Tcheck* outhost, - float alpha, - float beta, - miopenSoftmaxAlgorithm_t algo, - miopenSoftmaxMode_t mode) -{ - int n, c, h, w, in_nstr, in_cstr, in_hstr, in_wstr; - int out_nstr, out_cstr, out_hstr, out_wstr; - miopenGet4dTensorDescriptorLengths(inputTensor, &n, &c, &h, &w); - miopenGet4dTensorDescriptorStrides(inputTensor, &in_nstr, &in_cstr, &in_hstr, &in_wstr); - miopenGet4dTensorDescriptorStrides(outputTensor, &out_nstr, &out_cstr, &out_hstr, &out_wstr); - - Tcheck max_val = (sizeof(Tgpu) == 4) ? 3.402823466e+38f : 65504.; - std::vector channel_max((mode == MIOPEN_SOFTMAX_MODE_INSTANCE ? n : n * h * w), - static_cast(-max_val)); - std::vector results(n * c * h * w, static_cast(0.0)); - - int ret = 0; - - if(mode == MIOPEN_SOFTMAX_MODE_INSTANCE) - { - for(int i = 0; i < n; i++) - { - if(algo == MIOPEN_SOFTMAX_FAST) - { - for(int j = 0; j < c; j++) - for(int s0 = 0; s0 < h; s0++) - for(int s1 = 0; s1 < w; s1++) - { - results[(i * c + j) * h * w + s0 * w + s1] = static_cast( - in[i * in_nstr + j * in_cstr + s0 * in_hstr + s1 * in_wstr]); - } - } - else - { - for(int j = 0; j < c; j++) - for(int s0 = 0; s0 < h; s0++) - for(int s1 = 0; s1 < w; s1++) - { - channel_max[i] = std::max( - static_cast( - in[i * in_nstr + j * in_cstr + s0 * in_hstr + s1 * in_wstr]), - channel_max[i]); - } - - for(int j = 0; j < c; j++) - for(int s0 = 0; s0 < h; s0++) - for(int s1 = 0; s1 < w; s1++) - { - results[(i * c + j) * h * w + s0 * w + s1] = - static_cast( - in[i * in_nstr + j * in_cstr + s0 * in_hstr + s1 * in_wstr]) - - channel_max[i]; - } - } - - if(algo == MIOPEN_SOFTMAX_LOG) - { - Tcheck neg_inf = static_cast( - miopen::deref(inputTensor).GetType() == miopenHalf ? 
NEGATIVE_INF_FP16 - : NEGATIVE_INF_FP32); - channel_max[i] = neg_inf; - for(int j = 0; j < c; j++) - for(int s0 = 0; s0 < h; s0++) - for(int s1 = 0; s1 < w; s1++) - { - channel_max[i] = logaddexp(results[(i * c + j) * h * w + s0 * w + s1], - channel_max[i], - neg_inf); - } - - for(int j = 0; j < c; j++) - for(int s0 = 0; s0 < h; s0++) - for(int s1 = 0; s1 < w; s1++) - { - outhost[i * out_nstr + j * out_cstr + s0 * out_hstr + s1 * out_wstr] = - alpha * - (results[(i * c + j) * h * w + s0 * w + s1] - channel_max[i]) + - beta * outhost[i * out_nstr + j * out_cstr + s0 * out_hstr + - s1 * out_wstr]; - } - } - else - { - channel_max[i] = 0.0; - for(int j = 0; j < c; j++) - for(int s0 = 0; s0 < h; s0++) - for(int s1 = 0; s1 < w; s1++) - { - results[(i * c + j) * h * w + s0 * w + s1] = - exp(results[(i * c + j) * h * w + s0 * w + s1]); - channel_max[i] += results[(i * c + j) * h * w + s0 * w + s1]; - } - - for(int j = 0; j < c; j++) - for(int s0 = 0; s0 < h; s0++) - for(int s1 = 0; s1 < w; s1++) - { - outhost[i * out_nstr + j * out_cstr + s0 * out_hstr + s1 * out_wstr] = - alpha * - (results[(i * c + j) * h * w + s0 * w + s1] / channel_max[i]) + - beta * outhost[i * out_nstr + j * out_cstr + s0 * out_hstr + - s1 * out_wstr]; - } - } - } - } - else - { - for(int i = 0; i < n; i++) - { - for(int s0 = 0; s0 < h; s0++) - for(int s1 = 0; s1 < w; s1++) - { - if(algo == MIOPEN_SOFTMAX_FAST) - { - for(int j = 0; j < c; j++) - { - results[(i * c + j) * h * w + s0 * w + s1] = static_cast( - in[i * in_nstr + j * in_cstr + s0 * in_hstr + s1 * in_wstr]); - } - } - else - { - for(int j = 0; j < c; j++) - { - channel_max[i * h * w + s0 * w + s1] = std::max( - static_cast( - in[i * in_nstr + j * in_cstr + s0 * in_hstr + s1 * in_wstr]), - channel_max[i * h * w + s0 * w + s1]); - } - - for(int j = 0; j < c; j++) - { - results[(i * c + j) * h * w + s0 * w + s1] = - static_cast( - in[i * in_nstr + j * in_cstr + s0 * in_hstr + s1 * in_wstr]) - - channel_max[i * h * w + s0 * w + s1]; - } - } - 
- if(algo == MIOPEN_SOFTMAX_LOG) - { - Tcheck neg_inf = static_cast( - miopen::deref(inputTensor).GetType() == miopenHalf ? NEGATIVE_INF_FP16 - : NEGATIVE_INF_FP32); - channel_max[i * h * w + s0 * w + s1] = results[i * c * h * w + s0 * w + s1]; - for(int j = 1; j < c; j++) - { - channel_max[i * h * w + s0 * w + s1] = - logaddexp(results[(i * c + j) * h * w + s0 * w + s1], - channel_max[i * h * w + s0 * w + s1], - neg_inf); - } - - for(int j = 0; j < c; j++) - { - outhost[i * out_nstr + j * out_cstr + s0 * out_hstr + s1 * out_wstr] = - alpha * (results[(i * c + j) * h * w + s0 * w + s1] - - channel_max[i * h * w + s0 * w + s1]) + - beta * outhost[i * out_nstr + j * out_cstr + s0 * out_hstr + - s1 * out_wstr]; - } - } - else - { - channel_max[i * h * w + s0 * w + s1] = 0.0; - for(int j = 0; j < c; j++) - { - results[(i * c + j) * h * w + s0 * w + s1] = - exp(results[(i * c + j) * h * w + s0 * w + s1]); - channel_max[i * h * w + s0 * w + s1] += - results[(i * c + j) * h * w + s0 * w + s1]; - } - - for(int j = 0; j < c; j++) - { - outhost[i * out_nstr + j * out_cstr + s0 * out_hstr + s1 * out_wstr] = - alpha * (results[(i * c + j) * h * w + s0 * w + s1] / - channel_max[i * h * w + s0 * w + s1]) + - beta * outhost[i * out_nstr + j * out_cstr + s0 * out_hstr + - s1 * out_wstr]; - } - } - } - } - } - - return ret; -} - -template -int mloSoftmaxBackwardRunHost(miopenTensorDescriptor_t dInputTensor, - miopenTensorDescriptor_t dOutputTensor, - Tgpu* out, - Tgpu* dout, - Tcheck* dinhost, - float alpha, - float beta, - miopenSoftmaxAlgorithm_t algo, - miopenSoftmaxMode_t mode) -{ - int n, c, h, w, in_nstr, in_cstr, in_hstr, in_wstr; - int out_nstr, out_cstr, out_hstr, out_wstr; - miopenGet4dTensorDescriptorLengths(dOutputTensor, &n, &c, &h, &w); - miopenGet4dTensorDescriptorStrides(dInputTensor, &in_nstr, &in_cstr, &in_hstr, &in_wstr); - miopenGet4dTensorDescriptorStrides(dOutputTensor, &out_nstr, &out_cstr, &out_hstr, &out_wstr); - - std::vector channel_dot((mode == 
MIOPEN_SOFTMAX_MODE_INSTANCE ? n : n * h * w), - static_cast(0.0)); - std::vector results(n * c * h * w, static_cast(0.0)); - - int ret = 0; - - for(int i = 0; i < n; i++) - { - if(mode == MIOPEN_SOFTMAX_MODE_INSTANCE) - { - for(int j = 0; j < c; j++) - for(int s0 = 0; s0 < h; s0++) - for(int s1 = 0; s1 < w; s1++) - { - if(algo == MIOPEN_SOFTMAX_LOG) - { - channel_dot[i] += static_cast( - dout[i * out_nstr + j * out_cstr + s0 * out_hstr + s1 * out_wstr]); - } - else - { - channel_dot[i] += - static_cast(out[i * out_nstr + j * out_cstr + - s0 * out_hstr + s1 * out_wstr]) * - static_cast(dout[i * out_nstr + j * out_cstr + - s0 * out_hstr + s1 * out_wstr]); - } - } - - for(int j = 0; j < c; j++) - for(int s0 = 0; s0 < h; s0++) - for(int s1 = 0; s1 < w; s1++) - { - if(algo == MIOPEN_SOFTMAX_LOG) - { - results[(i * c + j) * h * w + s0 * w + s1] = - static_cast(dout[i * out_nstr + j * out_cstr + - s0 * out_hstr + s1 * out_wstr]) - - channel_dot[i] * std::exp(out[i * out_nstr + j * out_cstr + - s0 * out_hstr + s1 * out_wstr]); - } - else - { - results[(i * c + j) * h * w + s0 * w + s1] = - static_cast(dout[i * out_nstr + j * out_cstr + - s0 * out_hstr + s1 * out_wstr]) - - channel_dot[i]; - - results[(i * c + j) * h * w + s0 * w + s1] *= static_cast( - out[i * out_nstr + j * out_cstr + s0 * out_hstr + s1 * out_wstr]); - } - dinhost[i * in_nstr + j * in_cstr + s0 * in_hstr + s1 * in_wstr] = - alpha * results[(i * c + j) * h * w + s0 * w + s1] + - beta * dinhost[i * in_nstr + j * in_cstr + s0 * in_hstr + s1 * in_wstr]; - } - } - else - { - for(int s0 = 0; s0 < h; s0++) - for(int s1 = 0; s1 < w; s1++) - { - for(int j = 0; j < c; j++) - { - if(algo == MIOPEN_SOFTMAX_LOG) - { - channel_dot[i * h * w + s0 * w + s1] += static_cast( - dout[i * out_nstr + j * out_cstr + s0 * out_hstr + s1 * out_wstr]); - } - else - { - channel_dot[i * h * w + s0 * w + s1] += - static_cast(out[i * out_nstr + j * out_cstr + - s0 * out_hstr + s1 * out_wstr]) * - static_cast(dout[i * out_nstr + j * 
out_cstr + - s0 * out_hstr + s1 * out_wstr]); - } - } - - for(int j = 0; j < c; j++) - { - if(algo == MIOPEN_SOFTMAX_LOG) - { - results[(i * c + j) * h * w + s0 * w + s1] = - static_cast(dout[i * out_nstr + j * out_cstr + - s0 * out_hstr + s1 * out_wstr]) - - channel_dot[i * h * w + s0 * w + s1] * - std::exp(out[i * out_nstr + j * out_cstr + s0 * out_hstr + - s1 * out_wstr]); - } - else - { - results[(i * c + j) * h * w + s0 * w + s1] = - static_cast(dout[i * out_nstr + j * out_cstr + - s0 * out_hstr + s1 * out_wstr]) - - channel_dot[i * h * w + s0 * w + s1]; - - results[(i * c + j) * h * w + s0 * w + s1] *= static_cast( - out[i * out_nstr + j * out_cstr + s0 * out_hstr + s1 * out_wstr]); - } - dinhost[i * in_nstr + j * in_cstr + s0 * in_hstr + s1 * in_wstr] = - alpha * results[(i * c + j) * h * w + s0 * w + s1] + - beta * dinhost[i * in_nstr + j * in_cstr + s0 * in_hstr + s1 * in_wstr]; - } - } - } - } - - return ret; -} - +#include #endif diff --git a/projects/miopen/miopen_utils/include/miopen_utils/gpu_mem.hpp b/projects/miopen/miopen_utils/include/miopen_utils/gpu_mem.hpp new file mode 100644 index 000000000000..ee1f52b3090d --- /dev/null +++ b/projects/miopen/miopen_utils/include/miopen_utils/gpu_mem.hpp @@ -0,0 +1,12 @@ +// Forwarding header — GPUMem is defined in driver/driver.hpp. +// This allows test code to include GPUMem without directly depending +// on the driver/ directory. The GPUMem class should eventually be +// extracted into a standalone header here. +#ifndef GUARD_MIOPEN_UTILS_GPU_MEM_HPP +#define GUARD_MIOPEN_UTILS_GPU_MEM_HPP + +// Phase 1: Forward to driver.hpp which defines GPUMem. +// Phase 2: Extract GPUMem into this file directly. 
+#include "../../driver/driver.hpp" + +#endif diff --git a/projects/miopen/miopen_utils/include/miopen_utils/mloSoftmaxHost.hpp b/projects/miopen/miopen_utils/include/miopen_utils/mloSoftmaxHost.hpp new file mode 100644 index 000000000000..fd0a1768e6a6 --- /dev/null +++ b/projects/miopen/miopen_utils/include/miopen_utils/mloSoftmaxHost.hpp @@ -0,0 +1,350 @@ +// Copyright (c) Advanced Micro Devices, Inc., or its affiliates. +// SPDX-License-Identifier: MIT + +#ifndef MLO_SOFTMAXHOST_H_ +#define MLO_SOFTMAXHOST_H_ + +#include +#include + +//////////////////////////////////////////////////////////// +// +/////////////////////////////////////////////////////////// + +#define NEGATIVE_INF_FP32 (-1e20) +#define NEGATIVE_INF_FP16 (-1e5) + +template +T logaddexp(T x, T y, T neg_inf) +{ + T a = std::max(x, y); + T b = std::min(x, y); + T c = b - a; + + return c <= neg_inf ? std::max(a, neg_inf) : std::max(T(a + log(T(1) + exp(b - a))), neg_inf); +} + +template +int mloSoftmaxForwardRunHost(miopenTensorDescriptor_t inputTensor, + miopenTensorDescriptor_t outputTensor, + Tgpu* in, + Tcheck* outhost, + float alpha, + float beta, + miopenSoftmaxAlgorithm_t algo, + miopenSoftmaxMode_t mode) +{ + int n, c, h, w, in_nstr, in_cstr, in_hstr, in_wstr; + int out_nstr, out_cstr, out_hstr, out_wstr; + miopenGet4dTensorDescriptorLengths(inputTensor, &n, &c, &h, &w); + miopenGet4dTensorDescriptorStrides(inputTensor, &in_nstr, &in_cstr, &in_hstr, &in_wstr); + miopenGet4dTensorDescriptorStrides(outputTensor, &out_nstr, &out_cstr, &out_hstr, &out_wstr); + + Tcheck max_val = (sizeof(Tgpu) == 4) ? 3.402823466e+38f : 65504.; + std::vector channel_max((mode == MIOPEN_SOFTMAX_MODE_INSTANCE ? 
n : n * h * w), + static_cast(-max_val)); + std::vector results(n * c * h * w, static_cast(0.0)); + + int ret = 0; + + if(mode == MIOPEN_SOFTMAX_MODE_INSTANCE) + { + for(int i = 0; i < n; i++) + { + if(algo == MIOPEN_SOFTMAX_FAST) + { + for(int j = 0; j < c; j++) + for(int s0 = 0; s0 < h; s0++) + for(int s1 = 0; s1 < w; s1++) + { + results[(i * c + j) * h * w + s0 * w + s1] = static_cast( + in[i * in_nstr + j * in_cstr + s0 * in_hstr + s1 * in_wstr]); + } + } + else + { + for(int j = 0; j < c; j++) + for(int s0 = 0; s0 < h; s0++) + for(int s1 = 0; s1 < w; s1++) + { + channel_max[i] = std::max( + static_cast( + in[i * in_nstr + j * in_cstr + s0 * in_hstr + s1 * in_wstr]), + channel_max[i]); + } + + for(int j = 0; j < c; j++) + for(int s0 = 0; s0 < h; s0++) + for(int s1 = 0; s1 < w; s1++) + { + results[(i * c + j) * h * w + s0 * w + s1] = + static_cast( + in[i * in_nstr + j * in_cstr + s0 * in_hstr + s1 * in_wstr]) - + channel_max[i]; + } + } + + if(algo == MIOPEN_SOFTMAX_LOG) + { + Tcheck neg_inf = static_cast( + miopen::deref(inputTensor).GetType() == miopenHalf ? 
NEGATIVE_INF_FP16 + : NEGATIVE_INF_FP32); + channel_max[i] = neg_inf; + for(int j = 0; j < c; j++) + for(int s0 = 0; s0 < h; s0++) + for(int s1 = 0; s1 < w; s1++) + { + channel_max[i] = logaddexp(results[(i * c + j) * h * w + s0 * w + s1], + channel_max[i], + neg_inf); + } + + for(int j = 0; j < c; j++) + for(int s0 = 0; s0 < h; s0++) + for(int s1 = 0; s1 < w; s1++) + { + outhost[i * out_nstr + j * out_cstr + s0 * out_hstr + s1 * out_wstr] = + alpha * + (results[(i * c + j) * h * w + s0 * w + s1] - channel_max[i]) + + beta * outhost[i * out_nstr + j * out_cstr + s0 * out_hstr + + s1 * out_wstr]; + } + } + else + { + channel_max[i] = 0.0; + for(int j = 0; j < c; j++) + for(int s0 = 0; s0 < h; s0++) + for(int s1 = 0; s1 < w; s1++) + { + results[(i * c + j) * h * w + s0 * w + s1] = + exp(results[(i * c + j) * h * w + s0 * w + s1]); + channel_max[i] += results[(i * c + j) * h * w + s0 * w + s1]; + } + + for(int j = 0; j < c; j++) + for(int s0 = 0; s0 < h; s0++) + for(int s1 = 0; s1 < w; s1++) + { + outhost[i * out_nstr + j * out_cstr + s0 * out_hstr + s1 * out_wstr] = + alpha * + (results[(i * c + j) * h * w + s0 * w + s1] / channel_max[i]) + + beta * outhost[i * out_nstr + j * out_cstr + s0 * out_hstr + + s1 * out_wstr]; + } + } + } + } + else + { + for(int i = 0; i < n; i++) + { + for(int s0 = 0; s0 < h; s0++) + for(int s1 = 0; s1 < w; s1++) + { + if(algo == MIOPEN_SOFTMAX_FAST) + { + for(int j = 0; j < c; j++) + { + results[(i * c + j) * h * w + s0 * w + s1] = static_cast( + in[i * in_nstr + j * in_cstr + s0 * in_hstr + s1 * in_wstr]); + } + } + else + { + for(int j = 0; j < c; j++) + { + channel_max[i * h * w + s0 * w + s1] = std::max( + static_cast( + in[i * in_nstr + j * in_cstr + s0 * in_hstr + s1 * in_wstr]), + channel_max[i * h * w + s0 * w + s1]); + } + + for(int j = 0; j < c; j++) + { + results[(i * c + j) * h * w + s0 * w + s1] = + static_cast( + in[i * in_nstr + j * in_cstr + s0 * in_hstr + s1 * in_wstr]) - + channel_max[i * h * w + s0 * w + s1]; + } + } + 
+ if(algo == MIOPEN_SOFTMAX_LOG) + { + Tcheck neg_inf = static_cast( + miopen::deref(inputTensor).GetType() == miopenHalf ? NEGATIVE_INF_FP16 + : NEGATIVE_INF_FP32); + channel_max[i * h * w + s0 * w + s1] = results[i * c * h * w + s0 * w + s1]; + for(int j = 1; j < c; j++) + { + channel_max[i * h * w + s0 * w + s1] = + logaddexp(results[(i * c + j) * h * w + s0 * w + s1], + channel_max[i * h * w + s0 * w + s1], + neg_inf); + } + + for(int j = 0; j < c; j++) + { + outhost[i * out_nstr + j * out_cstr + s0 * out_hstr + s1 * out_wstr] = + alpha * (results[(i * c + j) * h * w + s0 * w + s1] - + channel_max[i * h * w + s0 * w + s1]) + + beta * outhost[i * out_nstr + j * out_cstr + s0 * out_hstr + + s1 * out_wstr]; + } + } + else + { + channel_max[i * h * w + s0 * w + s1] = 0.0; + for(int j = 0; j < c; j++) + { + results[(i * c + j) * h * w + s0 * w + s1] = + exp(results[(i * c + j) * h * w + s0 * w + s1]); + channel_max[i * h * w + s0 * w + s1] += + results[(i * c + j) * h * w + s0 * w + s1]; + } + + for(int j = 0; j < c; j++) + { + outhost[i * out_nstr + j * out_cstr + s0 * out_hstr + s1 * out_wstr] = + alpha * (results[(i * c + j) * h * w + s0 * w + s1] / + channel_max[i * h * w + s0 * w + s1]) + + beta * outhost[i * out_nstr + j * out_cstr + s0 * out_hstr + + s1 * out_wstr]; + } + } + } + } + } + + return ret; +} + +template +int mloSoftmaxBackwardRunHost(miopenTensorDescriptor_t dInputTensor, + miopenTensorDescriptor_t dOutputTensor, + Tgpu* out, + Tgpu* dout, + Tcheck* dinhost, + float alpha, + float beta, + miopenSoftmaxAlgorithm_t algo, + miopenSoftmaxMode_t mode) +{ + int n, c, h, w, in_nstr, in_cstr, in_hstr, in_wstr; + int out_nstr, out_cstr, out_hstr, out_wstr; + miopenGet4dTensorDescriptorLengths(dOutputTensor, &n, &c, &h, &w); + miopenGet4dTensorDescriptorStrides(dInputTensor, &in_nstr, &in_cstr, &in_hstr, &in_wstr); + miopenGet4dTensorDescriptorStrides(dOutputTensor, &out_nstr, &out_cstr, &out_hstr, &out_wstr); + + std::vector channel_dot((mode == 
MIOPEN_SOFTMAX_MODE_INSTANCE ? n : n * h * w), + static_cast(0.0)); + std::vector results(n * c * h * w, static_cast(0.0)); + + int ret = 0; + + for(int i = 0; i < n; i++) + { + if(mode == MIOPEN_SOFTMAX_MODE_INSTANCE) + { + for(int j = 0; j < c; j++) + for(int s0 = 0; s0 < h; s0++) + for(int s1 = 0; s1 < w; s1++) + { + if(algo == MIOPEN_SOFTMAX_LOG) + { + channel_dot[i] += static_cast( + dout[i * out_nstr + j * out_cstr + s0 * out_hstr + s1 * out_wstr]); + } + else + { + channel_dot[i] += + static_cast(out[i * out_nstr + j * out_cstr + + s0 * out_hstr + s1 * out_wstr]) * + static_cast(dout[i * out_nstr + j * out_cstr + + s0 * out_hstr + s1 * out_wstr]); + } + } + + for(int j = 0; j < c; j++) + for(int s0 = 0; s0 < h; s0++) + for(int s1 = 0; s1 < w; s1++) + { + if(algo == MIOPEN_SOFTMAX_LOG) + { + results[(i * c + j) * h * w + s0 * w + s1] = + static_cast(dout[i * out_nstr + j * out_cstr + + s0 * out_hstr + s1 * out_wstr]) - + channel_dot[i] * std::exp(out[i * out_nstr + j * out_cstr + + s0 * out_hstr + s1 * out_wstr]); + } + else + { + results[(i * c + j) * h * w + s0 * w + s1] = + static_cast(dout[i * out_nstr + j * out_cstr + + s0 * out_hstr + s1 * out_wstr]) - + channel_dot[i]; + + results[(i * c + j) * h * w + s0 * w + s1] *= static_cast( + out[i * out_nstr + j * out_cstr + s0 * out_hstr + s1 * out_wstr]); + } + dinhost[i * in_nstr + j * in_cstr + s0 * in_hstr + s1 * in_wstr] = + alpha * results[(i * c + j) * h * w + s0 * w + s1] + + beta * dinhost[i * in_nstr + j * in_cstr + s0 * in_hstr + s1 * in_wstr]; + } + } + else + { + for(int s0 = 0; s0 < h; s0++) + for(int s1 = 0; s1 < w; s1++) + { + for(int j = 0; j < c; j++) + { + if(algo == MIOPEN_SOFTMAX_LOG) + { + channel_dot[i * h * w + s0 * w + s1] += static_cast( + dout[i * out_nstr + j * out_cstr + s0 * out_hstr + s1 * out_wstr]); + } + else + { + channel_dot[i * h * w + s0 * w + s1] += + static_cast(out[i * out_nstr + j * out_cstr + + s0 * out_hstr + s1 * out_wstr]) * + static_cast(dout[i * out_nstr + j * 
out_cstr + + s0 * out_hstr + s1 * out_wstr]); + } + } + + for(int j = 0; j < c; j++) + { + if(algo == MIOPEN_SOFTMAX_LOG) + { + results[(i * c + j) * h * w + s0 * w + s1] = + static_cast(dout[i * out_nstr + j * out_cstr + + s0 * out_hstr + s1 * out_wstr]) - + channel_dot[i * h * w + s0 * w + s1] * + std::exp(out[i * out_nstr + j * out_cstr + s0 * out_hstr + + s1 * out_wstr]); + } + else + { + results[(i * c + j) * h * w + s0 * w + s1] = + static_cast(dout[i * out_nstr + j * out_cstr + + s0 * out_hstr + s1 * out_wstr]) - + channel_dot[i * h * w + s0 * w + s1]; + + results[(i * c + j) * h * w + s0 * w + s1] *= static_cast( + out[i * out_nstr + j * out_cstr + s0 * out_hstr + s1 * out_wstr]); + } + dinhost[i * in_nstr + j * in_cstr + s0 * in_hstr + s1 * in_wstr] = + alpha * results[(i * c + j) * h * w + s0 * w + s1] + + beta * dinhost[i * in_nstr + j * in_cstr + s0 * in_hstr + s1 * in_wstr]; + } + } + } + } + + return ret; +} + +#endif diff --git a/projects/miopen/test/gtest/find_mode_trust_verify.cpp b/projects/miopen/test/gtest/find_mode_trust_verify.cpp index 021a593f3372..178b1edff149 100644 --- a/projects/miopen/test/gtest/find_mode_trust_verify.cpp +++ b/projects/miopen/test/gtest/find_mode_trust_verify.cpp @@ -26,7 +26,7 @@ #include #include -#include "../../driver/driver.hpp" +#include namespace miopen { std::vector diff --git a/projects/miopen/test/gtest/kernel_tuning_net.cpp b/projects/miopen/test/gtest/kernel_tuning_net.cpp index 304adb9800d4..760a099b2ef4 100644 --- a/projects/miopen/test/gtest/kernel_tuning_net.cpp +++ b/projects/miopen/test/gtest/kernel_tuning_net.cpp @@ -30,7 +30,7 @@ #include #include #include -#include "../../driver/driver.hpp" +#include struct KernelTuningNetTestCase : AIModelTestCase { diff --git a/projects/miopen/test/gtest/softmax_find20.cpp b/projects/miopen/test/gtest/softmax_find20.cpp index c3f4857c38c8..d9acb567b7c8 100644 --- a/projects/miopen/test/gtest/softmax_find20.cpp +++ b/projects/miopen/test/gtest/softmax_find20.cpp 
@@ -28,7 +28,7 @@ #include "test.hpp" #include "get_handle.hpp" #include "tensor_holder.hpp" -#include "../driver/mloSoftmaxHost.hpp" +#include #include "verify.hpp" #include From 2da91e01a89914c945c33d18ac7fa412a85c8a03 Mon Sep 17 00:00:00 2001 From: Brad Pepers Date: Sat, 9 May 2026 10:19:25 -0600 Subject: [PATCH 06/11] Fix forwarding headers: remove duplicate include guards MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The forwarding headers (e.g., src/include/miopen/rank.hpp) and their targets (e.g., common_utils/include/common_utils/rank.hpp) used the same include guard macro. This caused the target's content to be skipped when included through the forwarding header, since the guard was already defined by the forwarder. Fix: Remove include guards from all forwarding headers entirely. They contain no content of their own — just a single #include — so the target file's own guard provides all necessary protection. Affects 26 forwarding headers across src/include/miopen/, test/, and driver/. 
Co-Authored-By: Claude Sonnet 4 --- projects/miopen/driver/mloSoftmaxHost.hpp | 3 -- projects/miopen/driver/random.hpp | 3 -- .../miopen/src/include/miopen/algorithm.hpp | 30 +------------------ .../miopen/src/include/miopen/bfloat16.hpp | 3 -- .../miopen/src/include/miopen/each_args.hpp | 30 +------------------ .../miopen/src/include/miopen/float_equal.hpp | 30 +------------------ projects/miopen/src/include/miopen/ford.hpp | 3 -- .../miopen/src/include/miopen/functional.hpp | 3 -- .../miopen/src/include/miopen/par_for.hpp | 30 +------------------ projects/miopen/src/include/miopen/rank.hpp | 30 +------------------ .../src/include/miopen/reduce_common.hpp | 3 -- .../miopen/src/include/miopen/returns.hpp | 30 +------------------ .../miopen/src/include/miopen/stringutils.hpp | 3 -- .../miopen/src/include/miopen/type_name.hpp | 30 +------------------ projects/miopen/test/cpu_bias.hpp | 3 -- projects/miopen/test/cpu_conv.hpp | 3 -- projects/miopen/test/cpu_layernorm.hpp | 3 -- projects/miopen/test/cpu_reduce_util.hpp | 3 -- projects/miopen/test/fusionHost.hpp | 1 - projects/miopen/test/gemm.hpp | 3 -- projects/miopen/test/network_data.hpp | 3 -- projects/miopen/test/random.hpp | 3 -- projects/miopen/test/rnn_util.hpp | 3 -- projects/miopen/test/serialize.hpp | 3 -- projects/miopen/test/tensor_holder.hpp | 3 -- projects/miopen/test/verify.hpp | 3 -- 26 files changed, 7 insertions(+), 258 deletions(-) diff --git a/projects/miopen/driver/mloSoftmaxHost.hpp b/projects/miopen/driver/mloSoftmaxHost.hpp index e0fec924c5b9..928eb6f63490 100644 --- a/projects/miopen/driver/mloSoftmaxHost.hpp +++ b/projects/miopen/driver/mloSoftmaxHost.hpp @@ -1,5 +1,2 @@ // Forwarding header — implementation moved to miopen_utils. 
-#ifndef MLO_SOFTMAXHOST_H_ -#define MLO_SOFTMAXHOST_H_ #include -#endif diff --git a/projects/miopen/driver/random.hpp b/projects/miopen/driver/random.hpp index 81e630411c67..30be9387d99c 100644 --- a/projects/miopen/driver/random.hpp +++ b/projects/miopen/driver/random.hpp @@ -1,5 +1,2 @@ // Forwarding header — implementation moved to common_utils. -#ifndef GUARD_RANDOM_GEN_ -#define GUARD_RANDOM_GEN_ #include -#endif diff --git a/projects/miopen/src/include/miopen/algorithm.hpp b/projects/miopen/src/include/miopen/algorithm.hpp index 91b0383b823b..38b87c1e38b4 100644 --- a/projects/miopen/src/include/miopen/algorithm.hpp +++ b/projects/miopen/src/include/miopen/algorithm.hpp @@ -1,30 +1,2 @@ -/******************************************************************************* - * - * MIT License - * - * Copyright (c) 2019 Advanced Micro Devices, Inc. - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- * - *******************************************************************************/ -// Forwarding header -- implementation moved to common_utils. -#ifndef GUARD_MLOPEN_ALGORITHM_HPP -#define GUARD_MLOPEN_ALGORITHM_HPP +// Forwarding header — implementation moved to common_utils. #include -#endif diff --git a/projects/miopen/src/include/miopen/bfloat16.hpp b/projects/miopen/src/include/miopen/bfloat16.hpp index fc3880629c68..eab3c5b2c826 100644 --- a/projects/miopen/src/include/miopen/bfloat16.hpp +++ b/projects/miopen/src/include/miopen/bfloat16.hpp @@ -1,5 +1,2 @@ // Forwarding header — implementation moved to common_utils. -#ifndef BFLOAT16_H_ -#define BFLOAT16_H_ #include -#endif diff --git a/projects/miopen/src/include/miopen/each_args.hpp b/projects/miopen/src/include/miopen/each_args.hpp index 646fd53d263f..983c7da843dd 100644 --- a/projects/miopen/src/include/miopen/each_args.hpp +++ b/projects/miopen/src/include/miopen/each_args.hpp @@ -1,30 +1,2 @@ -/******************************************************************************* - * - * MIT License - * - * Copyright (c) 2017 Advanced Micro Devices, Inc. - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - * - *******************************************************************************/ -// Forwarding header -- implementation moved to common_utils. -#ifndef GUARD_MIOPEN_EACH_ARGS_HPP -#define GUARD_MIOPEN_EACH_ARGS_HPP +// Forwarding header — implementation moved to common_utils. #include -#endif diff --git a/projects/miopen/src/include/miopen/float_equal.hpp b/projects/miopen/src/include/miopen/float_equal.hpp index 43bd3d7ab14a..a48c2e417489 100644 --- a/projects/miopen/src/include/miopen/float_equal.hpp +++ b/projects/miopen/src/include/miopen/float_equal.hpp @@ -1,30 +1,2 @@ -/******************************************************************************* - * - * MIT License - * - * Copyright (c) 2017 Advanced Micro Devices, Inc. - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - * - *******************************************************************************/ -// Forwarding header -- implementation moved to common_utils. -#ifndef GUARD_MLOPEN_FLOAT_EQUAL_HPP -#define GUARD_MLOPEN_FLOAT_EQUAL_HPP +// Forwarding header — implementation moved to common_utils. #include -#endif diff --git a/projects/miopen/src/include/miopen/ford.hpp b/projects/miopen/src/include/miopen/ford.hpp index 0dc62c9ae495..beac57e1e6e8 100644 --- a/projects/miopen/src/include/miopen/ford.hpp +++ b/projects/miopen/src/include/miopen/ford.hpp @@ -1,5 +1,2 @@ // Forwarding header — implementation moved to common_utils. -#ifndef GUARD_FORD_HPP -#define GUARD_FORD_HPP #include -#endif diff --git a/projects/miopen/src/include/miopen/functional.hpp b/projects/miopen/src/include/miopen/functional.hpp index d1f7cb973349..d0a70ae6794d 100644 --- a/projects/miopen/src/include/miopen/functional.hpp +++ b/projects/miopen/src/include/miopen/functional.hpp @@ -1,5 +1,2 @@ // Forwarding header — implementation moved to common_utils. -#ifndef GUARD_MLOPEN_FUNCTIONAL_HPP -#define GUARD_MLOPEN_FUNCTIONAL_HPP #include -#endif diff --git a/projects/miopen/src/include/miopen/par_for.hpp b/projects/miopen/src/include/miopen/par_for.hpp index 71a1125de408..4685b005db77 100644 --- a/projects/miopen/src/include/miopen/par_for.hpp +++ b/projects/miopen/src/include/miopen/par_for.hpp @@ -1,30 +1,2 @@ -/******************************************************************************* - * - * MIT License - * - * Copyright (c) 2020 Advanced Micro Devices, Inc. 
- * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - * - *******************************************************************************/ -// Forwarding header -- implementation moved to common_utils. -#ifndef MIOPEN_GUARD_MLOPEN_PAR_FOR_HPP -#define MIOPEN_GUARD_MLOPEN_PAR_FOR_HPP +// Forwarding header — implementation moved to common_utils. #include -#endif diff --git a/projects/miopen/src/include/miopen/rank.hpp b/projects/miopen/src/include/miopen/rank.hpp index 1756782673ad..88a4541421d4 100644 --- a/projects/miopen/src/include/miopen/rank.hpp +++ b/projects/miopen/src/include/miopen/rank.hpp @@ -1,30 +1,2 @@ -/******************************************************************************* - * - * MIT License - * - * Copyright (c) 2017 Advanced Micro Devices, Inc. 
- * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - * - *******************************************************************************/ -// Forwarding header -- implementation moved to common_utils. -#ifndef GUARD_MIOPEN_RANK_HPP -#define GUARD_MIOPEN_RANK_HPP +// Forwarding header — implementation moved to common_utils. #include -#endif diff --git a/projects/miopen/src/include/miopen/reduce_common.hpp b/projects/miopen/src/include/miopen/reduce_common.hpp index f1bd0b38e320..8d47ee0f05b0 100644 --- a/projects/miopen/src/include/miopen/reduce_common.hpp +++ b/projects/miopen/src/include/miopen/reduce_common.hpp @@ -1,5 +1,2 @@ // Forwarding header — implementation moved to common_utils. 
-#ifndef GUARD_MIOPEN_REDUCE_COMMON_HPP -#define GUARD_MIOPEN_REDUCE_COMMON_HPP #include -#endif diff --git a/projects/miopen/src/include/miopen/returns.hpp b/projects/miopen/src/include/miopen/returns.hpp index dd0873cfb2b3..8bd3067fdea3 100644 --- a/projects/miopen/src/include/miopen/returns.hpp +++ b/projects/miopen/src/include/miopen/returns.hpp @@ -1,30 +1,2 @@ -/******************************************************************************* - * - * MIT License - * - * Copyright (c) 2017 Advanced Micro Devices, Inc. - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - * - *******************************************************************************/ -// Forwarding header -- implementation moved to common_utils. -#ifndef GUARD_MIOPEN_RETURNS_HPP -#define GUARD_MIOPEN_RETURNS_HPP +// Forwarding header — implementation moved to common_utils. 
#include -#endif diff --git a/projects/miopen/src/include/miopen/stringutils.hpp b/projects/miopen/src/include/miopen/stringutils.hpp index 38f52efd1cf6..168eb6bee75e 100644 --- a/projects/miopen/src/include/miopen/stringutils.hpp +++ b/projects/miopen/src/include/miopen/stringutils.hpp @@ -1,5 +1,2 @@ // Forwarding header — implementation moved to common_utils. -#ifndef GUARD_MIOPEN_STRINGUTILS_HPP -#define GUARD_MIOPEN_STRINGUTILS_HPP #include -#endif diff --git a/projects/miopen/src/include/miopen/type_name.hpp b/projects/miopen/src/include/miopen/type_name.hpp index d2cce63d3d32..4f4afd78def0 100644 --- a/projects/miopen/src/include/miopen/type_name.hpp +++ b/projects/miopen/src/include/miopen/type_name.hpp @@ -1,30 +1,2 @@ -/******************************************************************************* - * - * MIT License - * - * Copyright (c) 2017-2024 Advanced Micro Devices, Inc. - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- * - *******************************************************************************/ -// Forwarding header -- implementation moved to common_utils. -#ifndef GUARD_TYPE_NAME_HPP -#define GUARD_TYPE_NAME_HPP +// Forwarding header — implementation moved to common_utils. #include -#endif diff --git a/projects/miopen/test/cpu_bias.hpp b/projects/miopen/test/cpu_bias.hpp index 4b150035d5c0..2abbcccde0da 100644 --- a/projects/miopen/test/cpu_bias.hpp +++ b/projects/miopen/test/cpu_bias.hpp @@ -1,5 +1,2 @@ // Forwarding header — implementation moved to miopen_utils. -#ifndef GUARD_CPU_BIAS_HPP -#define GUARD_CPU_BIAS_HPP #include -#endif diff --git a/projects/miopen/test/cpu_conv.hpp b/projects/miopen/test/cpu_conv.hpp index fac5227efe75..818e215c45e2 100644 --- a/projects/miopen/test/cpu_conv.hpp +++ b/projects/miopen/test/cpu_conv.hpp @@ -1,5 +1,2 @@ // Forwarding header — implementation moved to miopen_utils. -#ifndef GUARD_CPU_CONV_HPP -#define GUARD_CPU_CONV_HPP #include -#endif diff --git a/projects/miopen/test/cpu_layernorm.hpp b/projects/miopen/test/cpu_layernorm.hpp index 9f1c7a55ba42..a9f7b139484c 100644 --- a/projects/miopen/test/cpu_layernorm.hpp +++ b/projects/miopen/test/cpu_layernorm.hpp @@ -1,5 +1,2 @@ // Forwarding header — implementation moved to miopen_utils. -#ifndef GUARD_CPU_CONV_HPP -#define GUARD_CPU_CONV_HPP #include -#endif diff --git a/projects/miopen/test/cpu_reduce_util.hpp b/projects/miopen/test/cpu_reduce_util.hpp index 73de3b18e2e1..401dd20b994b 100644 --- a/projects/miopen/test/cpu_reduce_util.hpp +++ b/projects/miopen/test/cpu_reduce_util.hpp @@ -1,5 +1,2 @@ // Forwarding header — implementation moved to miopen_utils. 
-#ifndef GUARD_CPU_REDUCE_UTIL_HPP -#define GUARD_CPU_REDUCE_UTIL_HPP #include -#endif diff --git a/projects/miopen/test/fusionHost.hpp b/projects/miopen/test/fusionHost.hpp index a13ee5601cd4..c95d14da6f82 100644 --- a/projects/miopen/test/fusionHost.hpp +++ b/projects/miopen/test/fusionHost.hpp @@ -1,3 +1,2 @@ // Forwarding header — implementation moved to miopen_utils. -#pragma once #include diff --git a/projects/miopen/test/gemm.hpp b/projects/miopen/test/gemm.hpp index 34fa7db11bec..be0195545352 100644 --- a/projects/miopen/test/gemm.hpp +++ b/projects/miopen/test/gemm.hpp @@ -1,5 +1,2 @@ // Forwarding header — implementation moved to miopen_utils. -#ifndef GUARD_GEMM_HPP -#define GUARD_GEMM_HPP #include -#endif diff --git a/projects/miopen/test/network_data.hpp b/projects/miopen/test/network_data.hpp index 7a0dbcd702dd..18e85973ef3f 100644 --- a/projects/miopen/test/network_data.hpp +++ b/projects/miopen/test/network_data.hpp @@ -1,5 +1,2 @@ // Forwarding header — implementation moved to miopen_utils. -#ifndef GUARD_MIOPEN_TEST_NETWORK_DATA_HPP -#define GUARD_MIOPEN_TEST_NETWORK_DATA_HPP #include -#endif diff --git a/projects/miopen/test/random.hpp b/projects/miopen/test/random.hpp index 7c5c0efa5962..3bb99a37d6c9 100644 --- a/projects/miopen/test/random.hpp +++ b/projects/miopen/test/random.hpp @@ -1,5 +1,2 @@ // Forwarding header — implementation moved to miopen_utils. -#ifndef GUARD_MIOPEN_TEST_RANDOM_HPP -#define GUARD_MIOPEN_TEST_RANDOM_HPP #include -#endif diff --git a/projects/miopen/test/rnn_util.hpp b/projects/miopen/test/rnn_util.hpp index 2a25f35e61a8..0e771bfdfff1 100644 --- a/projects/miopen/test/rnn_util.hpp +++ b/projects/miopen/test/rnn_util.hpp @@ -1,5 +1,2 @@ // Forwarding header — implementation moved to miopen_utils. 
-#ifndef MIOPEN_RNN_UTIL_H_ -#define MIOPEN_RNN_UTIL_H_ #include -#endif diff --git a/projects/miopen/test/serialize.hpp b/projects/miopen/test/serialize.hpp index b9e948307a1e..c3eb459c38df 100644 --- a/projects/miopen/test/serialize.hpp +++ b/projects/miopen/test/serialize.hpp @@ -1,5 +1,2 @@ // Forwarding header — implementation moved to miopen_utils. -#ifndef MIOPEN_GUARD_TEST_SERIALIZE_HPP -#define MIOPEN_GUARD_TEST_SERIALIZE_HPP #include -#endif diff --git a/projects/miopen/test/tensor_holder.hpp b/projects/miopen/test/tensor_holder.hpp index 5f075eb9b528..bc10b5a8b12d 100644 --- a/projects/miopen/test/tensor_holder.hpp +++ b/projects/miopen/test/tensor_holder.hpp @@ -1,5 +1,2 @@ // Forwarding header — implementation moved to miopen_utils. -#ifndef GUARD_TENSOR_HOLDER_HPP -#define GUARD_TENSOR_HOLDER_HPP #include -#endif diff --git a/projects/miopen/test/verify.hpp b/projects/miopen/test/verify.hpp index 2bf12f1057a3..8807b5ecfe2b 100644 --- a/projects/miopen/test/verify.hpp +++ b/projects/miopen/test/verify.hpp @@ -1,5 +1,2 @@ // Forwarding header — implementation moved to miopen_utils. -#ifndef GUARD_VERIFY_HPP -#define GUARD_VERIFY_HPP #include -#endif From 64e5d404aa3c16a957140e4c5be59a990ecaac26 Mon Sep 17 00:00:00 2001 From: Brad Pepers Date: Sat, 9 May 2026 12:10:10 -0600 Subject: [PATCH 07/11] Add missing common_utils/miopen_utils linkage to gtest targets The forwarding headers in src/include/miopen/ include but the gtest build targets were not linking miopen_common_utils, so the include directory was not on the search path. This caused build failures for all gtest targets. 
Co-Authored-By: Claude Sonnet 4 --- projects/miopen/test/gtest/CMakeLists.txt | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/projects/miopen/test/gtest/CMakeLists.txt b/projects/miopen/test/gtest/CMakeLists.txt index af74113fa312..dfdb6ef4630e 100644 --- a/projects/miopen/test/gtest/CMakeLists.txt +++ b/projects/miopen/test/gtest/CMakeLists.txt @@ -81,7 +81,7 @@ function(add_gtest TEST_NAME TEST_CPP) # Workaround : change in rocm-cmake was causing linking error so had to add ${CMAKE_DL_LIBS} # We can remove ${CMAKE_DL_LIBS} once root cause is identified. # MIOpen_with_plugins ensures CK plugin .so's are built alongside the test - target_link_libraries(${TEST_NAME} ${CMAKE_DL_LIBS} GTest::gtest GTest::gtest_main GTest::gmock MIOpen_with_plugins hip::host ) + target_link_libraries(${TEST_NAME} ${CMAKE_DL_LIBS} GTest::gtest GTest::gtest_main GTest::gmock MIOpen_with_plugins hip::host miopen_common_utils miopen_utils) if(NOT MIOPEN_EMBED_DB STREQUAL "") target_link_libraries(${TEST_NAME} $) endif() @@ -211,7 +211,7 @@ endforeach() # Otherwise, all files in ${SOURCES} are rebuilt for each test. add_library(miopen_gtest_common STATIC ${SOURCES}) target_include_directories(miopen_gtest_common PRIVATE ../ ../../src/kernels) -target_link_libraries(miopen_gtest_common PRIVATE GTest::gtest GTest::gmock MIOpen) +target_link_libraries(miopen_gtest_common PRIVATE GTest::gtest GTest::gmock MIOpen miopen_common_utils miopen_utils) if(WIN32) # Refer to https://en.cppreference.com/w/cpp/language/types for details. 
target_compile_options(miopen_gtest_common PRIVATE $:-U__LP64__>>) From 896cbbc95bb4b6e7b9fd405e2e8dd9b72b90ff45 Mon Sep 17 00:00:00 2001 From: Brad Pepers Date: Sat, 9 May 2026 13:04:46 -0600 Subject: [PATCH 08/11] Add missing common_utils linkage to ck_impl and speedtest targets Same fix as the previous gtest commit: the forwarding headers in src/include/miopen/ resolve to common_utils/ headers, so any target that includes MIOpen internals needs miopen_common_utils on its include path. Co-Authored-By: Claude Sonnet 4 --- projects/miopen/speedtests/CMakeLists.txt | 2 +- projects/miopen/src/ck_impl/CMakeLists.txt | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/projects/miopen/speedtests/CMakeLists.txt b/projects/miopen/speedtests/CMakeLists.txt index 9aa89974cc75..826da17b59db 100644 --- a/projects/miopen/speedtests/CMakeLists.txt +++ b/projects/miopen/speedtests/CMakeLists.txt @@ -16,7 +16,7 @@ function(add_speedtest_executable TEST_NAME) endif() separate_arguments(MIOPEN_TEST_FLAGS_ARGS NATIVE_COMMAND ${MIOPEN_TEST_FLAGS}) # MIOpen_with_plugins ensures CK plugin .so's are built alongside the speedtest - target_link_libraries(${TEST_NAME} MIOpen_with_plugins) + target_link_libraries(${TEST_NAME} MIOpen_with_plugins miopen_common_utils miopen_utils) target_include_directories(${TEST_NAME} PRIVATE ../test ../src/kernels) endfunction(add_speedtest_executable) diff --git a/projects/miopen/src/ck_impl/CMakeLists.txt b/projects/miopen/src/ck_impl/CMakeLists.txt index ae380f174007..791250958533 100644 --- a/projects/miopen/src/ck_impl/CMakeLists.txt +++ b/projects/miopen/src/ck_impl/CMakeLists.txt @@ -145,7 +145,7 @@ foreach(gpu_target IN LISTS _CK_FILTERED_TARGETS) target_link_libraries(${lib_name} PRIVATE hip::device) # Link against MIOpen for shared types (ConvSolution, InvokerFactory, etc.) 
- target_link_libraries(${lib_name} PRIVATE MIOpen) + target_link_libraries(${lib_name} PRIVATE MIOpen miopen_common_utils) # Install alongside MIOpen install(TARGETS ${lib_name} From 70381cf345ad81221ffbdf4faa3ce066eb4919c1 Mon Sep 17 00:00:00 2001 From: Brad Pepers Date: Sat, 9 May 2026 16:19:47 -0600 Subject: [PATCH 09/11] Fix unified miopen_gtest build: restore lost transitive includes The forwarding headers and removed driver/ cross-includes broke 8 test files in the unified miopen_gtest binary: - test/fusionHost.hpp: add back get_handle.hpp include that the miopen_utils version correctly omits but test code depends on - reduceextreme.hpp, reducecalculation.hpp: move miopen/miopen.h before kernel headers that static_assert on its macros - layout_transpose.cpp: add float16 typedef lost when the driver/conv_common.hpp cross-include was removed Co-Authored-By: Claude Sonnet 4 --- projects/miopen/test/fusionHost.hpp | 1 + projects/miopen/test/gtest/layout_transpose.cpp | 2 ++ projects/miopen/test/gtest/reducecalculation.hpp | 2 +- projects/miopen/test/gtest/reduceextreme.hpp | 2 +- 4 files changed, 5 insertions(+), 2 deletions(-) diff --git a/projects/miopen/test/fusionHost.hpp b/projects/miopen/test/fusionHost.hpp index c95d14da6f82..11c6d54f6257 100644 --- a/projects/miopen/test/fusionHost.hpp +++ b/projects/miopen/test/fusionHost.hpp @@ -1,2 +1,3 @@ // Forwarding header — implementation moved to miopen_utils. 
#include <miopen_utils/fusionHost.hpp> +#include "get_handle.hpp" diff --git a/projects/miopen/test/gtest/layout_transpose.cpp b/projects/miopen/test/gtest/layout_transpose.cpp index b4c86a99846a..b688d17b2aa7 100644 --- a/projects/miopen/test/gtest/layout_transpose.cpp +++ b/projects/miopen/test/gtest/layout_transpose.cpp @@ -37,6 +37,8 @@ #include +using float16 = half_float::half; + namespace { template diff --git a/projects/miopen/test/gtest/reducecalculation.hpp b/projects/miopen/test/gtest/reducecalculation.hpp index 94b70ac8a1ea..3b2de8465c0c 100644 --- a/projects/miopen/test/gtest/reducecalculation.hpp +++ b/projects/miopen/test/gtest/reducecalculation.hpp @@ -24,13 +24,13 @@ * *******************************************************************************/ +#include <miopen/miopen.h> #include "../src/kernels/MIOpenReduceCalculation.hpp" #include "get_handle.hpp" #include "random.hpp" #include "tensor_holder.hpp" #include "verify.hpp" #include -#include <miopen/miopen.h> #include template diff --git a/projects/miopen/test/gtest/reduceextreme.hpp b/projects/miopen/test/gtest/reduceextreme.hpp index 4d2658a39569..0c2cde8c7564 100644 --- a/projects/miopen/test/gtest/reduceextreme.hpp +++ b/projects/miopen/test/gtest/reduceextreme.hpp @@ -24,6 +24,7 @@ * *******************************************************************************/ +#include <miopen/miopen.h> #include "../src/kernels/MIOpenReduceExtreme.hpp" #include "get_handle.hpp" #include "random.hpp" @@ -31,7 +32,6 @@ #include "verify.hpp" #include #include -#include <miopen/miopen.h> template bool compare_equal(T r1, T r2)
No functional or build behavior changes. Co-Authored-By: Claude Sonnet 4 --- projects/miopen/common_utils/CMakeLists.txt | 9 ++++++--- projects/miopen/miopen_utils/CMakeLists.txt | 8 +++++--- 2 files changed, 11 insertions(+), 6 deletions(-) diff --git a/projects/miopen/common_utils/CMakeLists.txt b/projects/miopen/common_utils/CMakeLists.txt index 1afb185255c9..7bb6572ee7ac 100644 --- a/projects/miopen/common_utils/CMakeLists.txt +++ b/projects/miopen/common_utils/CMakeLists.txt @@ -24,13 +24,16 @@ # ################################################################################ -# Header-only utility library shared by MIOpen, MIOpenDriver, and tests. -# Contains pure C++ utilities with NO MIOpen or GPU dependencies. +# INTERNAL BUILD-ONLY library — not installed, not exported, not part of the public MIOpen API. +# Header-only pure C++ utilities shared by MIOpen, MIOpenDriver, and tests. +# Contains NO MIOpen or GPU dependencies. +# Do NOT add install(TARGETS miopen_common_utils ...) — headers live in the build tree only. add_library(miopen_common_utils INTERFACE) -add_library(MIOpen::common_utils ALIAS miopen_common_utils) +set_target_properties(miopen_common_utils PROPERTIES EXCLUDE_FROM_ALL TRUE) target_include_directories(miopen_common_utils INTERFACE + # BUILD_INTERFACE only — no install interface; these headers are not installed. $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/include> ) diff --git a/projects/miopen/miopen_utils/CMakeLists.txt b/projects/miopen/miopen_utils/CMakeLists.txt index 47e61c063411..e93a717d0a0e 100644 --- a/projects/miopen/miopen_utils/CMakeLists.txt +++ b/projects/miopen/miopen_utils/CMakeLists.txt @@ -24,14 +24,16 @@ # ################################################################################ -# Utility library for MIOpen test/verification code shared by MIOpenDriver and tests. +# INTERNAL BUILD-ONLY library — not installed, not exported, not part of the public MIOpen API. +# Shared verification/test utilities for MIOpenDriver and tests. 
# Depends on common_utils and the MIOpen public API (miopen.h). -# Phase 1: May still use MIOpen internal headers temporarily. +# Do NOT add install(TARGETS miopen_utils ...) — headers live in the build tree only. add_library(miopen_utils INTERFACE) -add_library(MIOpen::miopen_utils ALIAS miopen_utils) +set_target_properties(miopen_utils PROPERTIES EXCLUDE_FROM_ALL TRUE) target_include_directories(miopen_utils INTERFACE + # BUILD_INTERFACE only — no install interface; these headers are not installed. $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/include> ) From 8d14b9691b2820e244f08670f57b123329d285e3 Mon Sep 17 00:00:00 2001 From: Brad Pepers Date: Sun, 10 May 2026 14:44:09 -0600 Subject: [PATCH 11/11] Move MIOPEN_USE_RNE_BFLOAT16 option to top-level CMakeLists.txt The option was declared in two places (common_utils/ and src/), with the src/ declaration being a silent no-op since common_utils/ runs first. Move the single option() declaration to the top-level CMakeLists.txt, which is the canonical location for all project-wide MIOpen build options. Both common_utils/ and src/ now consume it from the CMake cache without re-declaring it. No functional or build behavior changes. Co-Authored-By: Claude Sonnet 4 --- projects/miopen/CMakeLists.txt | 7 +++++++ projects/miopen/common_utils/CMakeLists.txt | 5 ++--- projects/miopen/src/CMakeLists.txt | 9 +-------- 3 files changed, 10 insertions(+), 11 deletions(-) diff --git a/projects/miopen/CMakeLists.txt b/projects/miopen/CMakeLists.txt index 26bf20fd0690..57089253f3e1 100644 --- a/projects/miopen/CMakeLists.txt +++ b/projects/miopen/CMakeLists.txt @@ -110,6 +110,13 @@ if(MIOPEN_INCBIN) enable_language(ASM) endif() +# Truncation rounding or (default) rounding to nearest even (RNE) is enabled. +# This switch controls two related but different aspects of MIOpen behavior: +# 1. How host code performs conversions of float to bfloat16 (important for testing). +# 2. 
How BF16 kernels perform the final conversion (and rounding) of FP32 to BF16 results +# (affects the main functionality of the library). +option(MIOPEN_USE_RNE_BFLOAT16 "Sets rounding scheme for bfloat16 type" ON) + # Strip symbols for release if(MIOPEN_STRIP_SYMBOLS AND NOT WIN32 AND NOT APPLE) set(CMAKE_C_FLAGS_RELEASE "${CMAKE_C_FLAGS_RELEASE} -s") diff --git a/projects/miopen/common_utils/CMakeLists.txt b/projects/miopen/common_utils/CMakeLists.txt index 7bb6572ee7ac..d538ef6ef258 100644 --- a/projects/miopen/common_utils/CMakeLists.txt +++ b/projects/miopen/common_utils/CMakeLists.txt @@ -37,9 +37,8 @@ target_include_directories(miopen_common_utils INTERFACE $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/include> ) -# bfloat16.hpp needs to know the rounding mode. -# This option is also defined in src/CMakeLists.txt for backward compatibility. -option(MIOPEN_USE_RNE_BFLOAT16 "Sets rounding scheme for bfloat16 type" ON) +# bfloat16.hpp needs MIOPEN_USE_RNE_BFLOAT16 at compile time. +# The option is declared in the top-level CMakeLists.txt. if(MIOPEN_USE_RNE_BFLOAT16) target_compile_definitions(miopen_common_utils INTERFACE MIOPEN_USE_RNE_BFLOAT16=1) else() diff --git a/projects/miopen/src/CMakeLists.txt b/projects/miopen/src/CMakeLists.txt index 3ba48b6ca763..84bbd53716fb 100644 --- a/projects/miopen/src/CMakeLists.txt +++ b/projects/miopen/src/CMakeLists.txt @@ -8,14 +8,7 @@ if(MIOPEN_ENABLE_SQLITE) add_subdirectory(sqlite) endif() -# Truncation rounding or (default) rounding to nearest even (RNE) is enabled. -# This switch controls two related but different aspects of MIOpen behavior -# 1. How host code performs conversions of float to bfloat16, important only -# for testing. -# 2. How BF16 kernels (which are kind of mixed-precision now and expected to -# remain in the future) perform final conversion (and rounding) of FP32 -# to BF16 results. This affects the main functionality of the library. 
-option( MIOPEN_USE_RNE_BFLOAT16 "Sets rounding scheme for bfloat16 type" ON ) +# MIOPEN_USE_RNE_BFLOAT16 is declared in the top-level CMakeLists.txt. option( MIOPEN_FP8_IEEE_EXPONENT_BIAS "Sets the FP8 exponent bias to IEEE" OFF) option( MIOPEN_FP8_CLIPPING "Sets the FP8 clipping" ON)