diff --git a/projects/miopen/CMakeLists.txt b/projects/miopen/CMakeLists.txt
index af87cd1c7e16..57089253f3e1 100644
--- a/projects/miopen/CMakeLists.txt
+++ b/projects/miopen/CMakeLists.txt
@@ -110,6 +110,13 @@ if(MIOPEN_INCBIN)
     enable_language(ASM)
 endif()
 
+# Truncation rounding or (default) rounding to nearest even (RNE) is enabled.
+# This switch controls two related but different aspects of MIOpen behavior:
+# 1. How host code performs conversions of float to bfloat16 (important for testing).
+# 2. How BF16 kernels perform the final conversion (and rounding) of FP32 to BF16 results
+#    (affects the main functionality of the library).
+option(MIOPEN_USE_RNE_BFLOAT16 "Sets rounding scheme for bfloat16 type" ON)
+
 # Strip symbols for release
 if(MIOPEN_STRIP_SYMBOLS AND NOT WIN32 AND NOT APPLE)
     set(CMAKE_C_FLAGS_RELEASE "${CMAKE_C_FLAGS_RELEASE} -s")
@@ -894,8 +901,10 @@ set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/bin)
 if(NOT MIOPEN_USE_SQLITE_PERFDB)
     add_subdirectory(tools/sqlite2txt)
 endif()
+add_subdirectory(common_utils)
 add_subdirectory(addkernels)
 add_subdirectory(src)
+add_subdirectory(miopen_utils)
 if(MIOPEN_BUILD_DRIVER)
     add_subdirectory(driver)
 endif()
diff --git a/projects/miopen/common_utils/CMakeLists.txt b/projects/miopen/common_utils/CMakeLists.txt
new file mode 100644
index 000000000000..d538ef6ef258
--- /dev/null
+++ b/projects/miopen/common_utils/CMakeLists.txt
@@ -0,0 +1,46 @@
+################################################################################
+#
+# MIT License
+#
+# Copyright (c) 2025 Advanced Micro Devices, Inc.
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in all
+# copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+#
+################################################################################
+
+# INTERNAL BUILD-ONLY library — not installed, not exported, not part of the public MIOpen API.
+# Header-only pure C++ utilities shared by MIOpen, MIOpenDriver, and tests.
+# Contains NO MIOpen or GPU dependencies.
+# Do NOT add install(TARGETS miopen_common_utils ...) — headers live in the build tree only.
+
+add_library(miopen_common_utils INTERFACE)
+set_target_properties(miopen_common_utils PROPERTIES EXCLUDE_FROM_ALL TRUE)
+
+target_include_directories(miopen_common_utils INTERFACE
+    # BUILD_INTERFACE only — no install interface; these headers are not installed.
+    $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/include>
+)
+
+# bfloat16.hpp needs MIOPEN_USE_RNE_BFLOAT16 at compile time.
+# The option is declared in the top-level CMakeLists.txt.
+if(MIOPEN_USE_RNE_BFLOAT16)
+    target_compile_definitions(miopen_common_utils INTERFACE MIOPEN_USE_RNE_BFLOAT16=1)
+else()
+    target_compile_definitions(miopen_common_utils INTERFACE MIOPEN_USE_RNE_BFLOAT16=0)
+endif()
diff --git a/projects/miopen/common_utils/include/common_utils/algorithm.hpp b/projects/miopen/common_utils/include/common_utils/algorithm.hpp
new file mode 100644
index 000000000000..d1098a066077
--- /dev/null
+++ b/projects/miopen/common_utils/include/common_utils/algorithm.hpp
@@ -0,0 +1,47 @@
+/*******************************************************************************
+ *
+ * MIT License
+ *
+ * Copyright (c) 2019 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ *******************************************************************************/
+#ifndef GUARD_MLOPEN_ALGORITHM_HPP
+#define GUARD_MLOPEN_ALGORITHM_HPP
+
+#include <algorithm>
+
+namespace miopen {
+
+// Range-based convenience wrapper over std::any_of.
+template <class Range, class Predicate>
+bool any_of(const Range& r, Predicate p)
+{
+    return std::any_of(r.begin(), r.end(), p);
+}
+
+// Range-based convenience wrapper over std::all_of.
+template <class Range, class Predicate>
+bool all_of(const Range& r, Predicate p)
+{
+    return std::all_of(r.begin(), r.end(), p);
+}
+
+} // namespace miopen
+
+#endif
diff --git a/projects/miopen/common_utils/include/common_utils/bfloat16.hpp b/projects/miopen/common_utils/include/common_utils/bfloat16.hpp
new file mode 100644
index 000000000000..71fe70bbd3c7
--- /dev/null
+++ b/projects/miopen/common_utils/include/common_utils/bfloat16.hpp
@@ -0,0 +1,179 @@
+// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
+// SPDX-License-Identifier: MIT
+
+#ifndef BFLOAT16_H_
+#define BFLOAT16_H_
+
+#include <cstdint>
+#include <functional>
+#include <limits>
+// MIOPEN_USE_RNE_BFLOAT16 is provided via CMake compile definitions.
+
+// Host-side bfloat16 emulation: 1 sign bit, 8 exponent bits, 7 mantissa bits.
+// Stored as the upper 16 bits of the equivalent IEEE-754 binary32 value.
+class bfloat16
+{
+public:
+    bfloat16() : data_{0} {}
+    explicit bfloat16(float rhs)
+    {
+        union
+        {
+            float float_st;
+            std::uint32_t bf16_st;
+        } bits_st = {rhs};
+
+        // BF16 round and NaN preservation code matches
+        // https://github.com/ROCm/rocBLAS/blob/develop/library/include/rocblas_bfloat16.h
+        if((~bits_st.bf16_st & 0x7f800000) == 0) // Inf or NaN
+        {
+            // When all of the exponent bits are 1, the value is Inf or NaN.
+            // Inf is indicated by a zero mantissa. NaN is indicated by any nonzero
+            // mantissa bit. Quiet NaN is indicated by the most significant mantissa
+            // bit being 1. Signaling NaN is indicated by the most significant
+            // mantissa bit being 0 but some other bit(s) being 1. If any of the
+            // lower 16 bits of the mantissa are 1, we set the least significant bit
+            // of the bfloat16 mantissa, in order to preserve signaling NaN in case
+            // the bfloat16's mantissa bits are all 0.
+            if((bits_st.bf16_st & 0xffff) != 0)
+            {
+                bits_st.bf16_st |= 0x10000; // Preserve signaling NaN
+            }
+        }
+        else
+        {
+#if MIOPEN_USE_RNE_BFLOAT16 == 1
+            // When the exponent bits are not all 1s, then the value is zero, normal,
+            // or subnormal. We round the bfloat16 mantissa up by adding 0x7FFF, plus
+            // 1 if the least significant bit of the bfloat16 mantissa is 1 (odd).
+            // This causes the bfloat16's mantissa to be incremented by 1 if the 16
+            // least significant bits of the float mantissa are greater than 0x8000,
+            // or if they are equal to 0x8000 and the least significant bit of the
+            // bfloat16 mantissa is 1 (odd). This causes it to be rounded to even when
+            // the lower 16 bits are exactly 0x8000. If the bfloat16 mantissa already
+            // has the value 0x7f, then incrementing it causes it to become 0x00 and
+            // the exponent is incremented by one, which is the next higher FP value
+            // to the unrounded bfloat16 value. When the bfloat16 value is subnormal
+            // with an exponent of 0x00 and a mantissa of 0x7F, it may be rounded up
+            // to a normal value with an exponent of 0x01 and a mantissa of 0x00.
+            // When the bfloat16 value has an exponent of 0xFE and a mantissa of 0x7F,
+            // incrementing it causes it to become an exponent of 0xFF and a mantissa
+            // of 0x00, which is Inf, the next higher value to the unrounded value.
+            bits_st.bf16_st +=
+                (0x7fff + ((bits_st.bf16_st >> 16) & 1)); // Round to nearest, round to even
+#else  // truncation
+// do nothing
+#endif
+        }
+        data_ = bits_st.bf16_st >> 16;
+    }
+    operator float() const
+    {
+        union
+        {
+            std::uint32_t bf16_st;
+            float float_st;
+        } bits_st = {data_};
+
+        bits_st.bf16_st = bits_st.bf16_st << 16;
+        return bits_st.float_st;
+    }
+
+    bfloat16 operator-() const { return bfloat16(-static_cast<float>(*this)); }
+    bfloat16 operator+() const { return *this; }
+
+    bfloat16& operator=(const float rhs)
+    {
+        *this = bfloat16(rhs);
+        return *this;
+    }
+    bfloat16& operator+=(bfloat16 rhs)
+    {
+        *this = bfloat16(static_cast<float>(*this) + static_cast<float>(rhs));
+        return *this;
+    }
+
+    bfloat16& operator+=(float rhs)
+    {
+        *this = bfloat16(static_cast<float>(*this) + rhs);
+        return *this;
+    }
+
+    bfloat16& operator-=(bfloat16 rhs)
+    {
+        *this += -rhs;
+        return *this;
+    }
+    bfloat16& operator*=(bfloat16 rhs)
+    {
+        *this = bfloat16(static_cast<float>(*this) * static_cast<float>(rhs));
+        return *this;
+    }
+    bfloat16& operator*=(float rhs)
+    {
+        *this = bfloat16(static_cast<float>(*this) * rhs);
+        return *this;
+    }
+
+    bfloat16& operator/=(bfloat16 rhs)
+    {
+        *this = bfloat16(static_cast<float>(*this) / static_cast<float>(rhs));
+        return *this;
+    }
+    bool operator<(bfloat16 rhs) const
+    {
+        return static_cast<float>(*this) < static_cast<float>(rhs);
+    }
+    // Exact bit-for-bit comparison via float conversion; std::equal_to avoids
+    // a -Wfloat-equal warning at the comparison site.
+    bool operator==(bfloat16 rhs) const { return std::equal_to<float>()(*this, rhs); }
+
+    // Build a bfloat16 directly from its 16-bit pattern (for numeric_limits).
+    static constexpr bfloat16 generate(uint16_t val) { return bfloat16{val, true}; }
+
+private:
+    constexpr bfloat16(std::uint16_t val, bool) : data_{val} {}
+
+    std::uint16_t data_;
+};
+
+inline bfloat16 operator+(bfloat16 a, const bfloat16& b)
+{
+    a += b;
+    return a;
+}
+
+inline bfloat16 operator-(bfloat16 a, const bfloat16& b)
+{
+    a -= b;
+    return a;
+}
+
+inline bfloat16 operator*(bfloat16 a, const bfloat16& b)
+{
+    a *= b;
+    return a;
+}
+
+inline bfloat16 operator/(bfloat16 a, const bfloat16& b)
+{
+    a /= b;
+    return a;
+}
+
+namespace std {
+template <>
+class numeric_limits<bfloat16>
+{
+public:
+    static constexpr bool is_specialized = true;
+    static constexpr bfloat16 min() noexcept { return bfloat16::generate(0x0080); } // 0x1.00p-126
+    static constexpr bfloat16 max() noexcept { return bfloat16::generate(0x7F7F); }
+    static constexpr bfloat16 lowest() noexcept { return bfloat16::generate(0xFF7F); }
+    static constexpr bfloat16 epsilon() noexcept { return bfloat16::generate(0x3C00); }
+    static constexpr bfloat16 infinity() noexcept { return bfloat16::generate(0x7F80); }
+    static constexpr bfloat16 quiet_NaN() noexcept { return bfloat16::generate(0x7FC0); } // qnan(0)
+    static constexpr bfloat16 signaling_NaN() noexcept
+    {
+        return bfloat16::generate(0x7F81); // snan(1)
+    }
+    static constexpr bfloat16 denorm_min() noexcept
+    {
+        return bfloat16::generate(0x0001); // 0x0.02p-126
+    }
+};
+} // namespace std
+#endif
diff --git a/projects/miopen/common_utils/include/common_utils/each_args.hpp b/projects/miopen/common_utils/include/common_utils/each_args.hpp
new file mode 100644
index 000000000000..e078153dc998
--- /dev/null
+++ b/projects/miopen/common_utils/include/common_utils/each_args.hpp
@@ -0,0 +1,79 @@
+/*******************************************************************************
+ *
+ * MIT License
+ *
+ * Copyright (c) 2017 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ *******************************************************************************/
+#ifndef GUARD_MIOPEN_EACH_ARGS_HPP
+#define GUARD_MIOPEN_EACH_ARGS_HPP
+
+#include <initializer_list>
+#include <tuple>
+#include <type_traits>
+#include <utility>
+
+namespace miopen {
+namespace detail {
+
+// Invoke f(index_constant, arg) for each argument, in order.
+template <class F, std::size_t... Ns, class... Ts>
+void each_args_i_impl(F f, std::index_sequence<Ns...>, Ts&&... xs)
+{
+    (void)std::initializer_list<int>{
+        (f(std::integral_constant<std::size_t, Ns>{}, std::forward<Ts>(xs)), 0)...};
+}
+
+// Apply f to the elements of tuple-like x.
+template <class F, std::size_t... Ns, class T>
+auto unpack_impl(F f, std::index_sequence<Ns...>, T&& x)
+{
+    return f(std::get<Ns>(x)...);
+}
+
+} // namespace detail
+
+// Call f(i, x) for each x, where i is a compile-time index constant.
+template <class F, class... Ts>
+void each_args_i(F f, Ts&&... xs)
+{
+    detail::each_args_i_impl(
+        f, std::make_index_sequence<sizeof...(Ts)>(), std::forward<Ts>(xs)...);
+}
+
+// Call f(x) for each argument, in order.
+template <class F, class... Ts>
+void each_args(F f, Ts&&... xs)
+{
+    (void)std::initializer_list<int>{(f(std::forward<Ts>(xs)), 0)...};
+}
+
+// Workaround for gcc warnings
+template <class F>
+void each_args(F)
+{
+}
+
+// Apply f to the elements of tuple-like x (like std::apply).
+template <class F, class T>
+auto unpack(F f, T&& x)
+{
+    using type = typename std::remove_cv<typename std::remove_reference<T>::type>::type;
+    return detail::unpack_impl(
+        f, std::make_index_sequence<std::tuple_size<type>::value>(), std::forward<T>(x));
+}
+
+} // namespace miopen
+
+#endif
diff --git a/projects/miopen/common_utils/include/common_utils/float_equal.hpp b/projects/miopen/common_utils/include/common_utils/float_equal.hpp
new file mode 100644
index 000000000000..24bbdc55ad11
--- /dev/null
+++ b/projects/miopen/common_utils/include/common_utils/float_equal.hpp
@@ -0,0 +1,89 @@
+/*******************************************************************************
+ *
+ * MIT License
+ *
+ * Copyright (c) 2017 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ * + *******************************************************************************/ +#ifndef GUARD_MLOPEN_FLOAT_EQUAL_HPP +#define GUARD_MLOPEN_FLOAT_EQUAL_HPP + +#include +#include +#include +#include + +namespace miopen { + +template +using common_type = typename std::common_type::type; + +struct float_equal_fn +{ + template + static bool apply(T x, T y) + { + // The standard library from MSVC does not implement std::isfinite() for integer + // types - no additional overloads are provided. According to the documentation, + // integer types should be treaded as doubles. + // Refer to https://en.cppreference.com/w/cpp/numeric/math/isfinite for more information. + return std::isfinite(static_cast(x)) and std::isfinite(static_cast(y)) and + std::nextafter(x, std::numeric_limits::lowest()) <= y and + std::nextafter(x, std::numeric_limits::max()) >= y; + } + + template + bool operator()(T x, U y) const + { + return float_equal_fn::apply>(x, y); + } +}; + +static constexpr float_equal_fn float_equal{}; + +/// Special case for comparing with a sentinel value +struct float_equal_sentinel_fn +{ + template + static bool apply(T x, T y) + { +// In this case we have to ignore this warning, because we intend to compare with the exact value +#pragma clang diagnostic push +#pragma clang diagnostic ignored "-Wfloat-equal" + bool equals_sentinel = x == y; +#pragma clang diagnostic pop + + return std::isfinite(static_cast(x)) and std::isfinite(static_cast(y)) and + equals_sentinel; + } + + template + bool operator()(T x, U y) const + { + return float_equal_sentinel_fn::apply>(x, y); + } +}; + +static constexpr float_equal_sentinel_fn float_equal_sentinel{}; + +} // namespace miopen + +#endif diff --git a/projects/miopen/common_utils/include/common_utils/ford.hpp b/projects/miopen/common_utils/include/common_utils/ford.hpp new file mode 100644 index 000000000000..4ff4ddfa32e2 --- /dev/null +++ b/projects/miopen/common_utils/include/common_utils/ford.hpp @@ -0,0 +1,122 @@ 
+/*******************************************************************************
+ *
+ * MIT License
+ *
+ * Copyright (c) 2017 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ *******************************************************************************/
+#ifndef GUARD_FORD_HPP
+#define GUARD_FORD_HPP
+
+#include <algorithm>
+#include <array>
+#include <cstddef>
+#include <functional>
+#include <future>
+#include <memory>
+#include <numeric>
+#include <thread>
+#include <type_traits>
+#include <vector>
+
+// NOTE(review): MIOPEN_RETURNS is used below; presumably provided by a
+// common_utils returns header — confirm the header name against the project.
+#include <common_utils/returns.hpp>
+
+#include <common_utils/each_args.hpp>
+#include <common_utils/par_for.hpp>
+
+namespace miopen {
+
+// An improved async, that doesn't block
+template <class Function>
+std::future<typename std::invoke_result<Function>::type> detach_async(Function&& f)
+{
+    using result_type = typename std::invoke_result<Function>::type;
+    std::packaged_task<result_type()> task(std::forward<Function>(f));
+    auto fut = task.get_future();
+    std::thread(std::move(task)).detach();
+    return fut;
+}
+
+// Chain a continuation w onto future f (evaluated lazily on get()).
+template <class T, class Work>
+auto then(std::future<T> f, Work w) -> std::future<decltype(w(f.get()))>
+{
+    return std::async(std::launch::deferred,
+                      [=, f_ = std::move(f)]() mutable { return w(f_.get()); });
+}
+
+template <class T>
+struct ford_wrapper
+{
+    template <class... Ts>
+    auto operator()(Ts... xs) const MIOPEN_RETURNS(std::bind(T{}, std::placeholders::_1, xs...));
+};
+
+// Multidimensional for loop
+struct ford_impl
+{
+    template <class F>
+    void operator()(F f) const
+    {
+        f();
+    }
+
+    template <class F, class T, class... Ts>
+    void operator()(F f, T x, Ts... xs) const
+    {
+        // Workaround for https://gcc.gnu.org/bugzilla/show_bug.cgi?id=55914
+        for(T i = 0; i < x; i++)
+        {
+            (*this)([&](Ts... is) { f(i, is...); }, xs...);
+        }
+    }
+};
+
+static constexpr ford_wrapper<ford_impl> ford{};
+
+// Parallel multidimensional for loop: flattens the index space and
+// dispatches the flat range through par_for.
+struct par_ford_impl
+{
+    template <class F, class... Ts>
+    void operator()(F f, Ts... xs) const
+    {
+        using array_type = std::array<std::size_t, sizeof...(Ts)>;
+        array_type lens = {{static_cast<std::size_t>(xs)...}};
+        array_type strides;
+        strides.fill(1);
+        std::partial_sum(
+            lens.rbegin(), lens.rend() - 1, strides.rbegin() + 1, std::multiplies<std::size_t>());
+        auto size = std::accumulate(
+            lens.begin(), lens.end(), static_cast<std::size_t>(1), std::multiplies<std::size_t>());
+        par_for(size, [&](std::size_t i) {
+            array_type indices;
+            std::transform(strides.begin(),
+                           strides.end(),
+                           lens.begin(),
+                           indices.begin(),
+                           [&](size_t stride, size_t len) { return (i / stride) % len; });
+            unpack(f, indices);
+        });
+    }
+};
+
+static constexpr ford_wrapper<par_ford_impl> par_ford{};
+
+} // namespace miopen
+
+#endif
diff --git a/projects/miopen/common_utils/include/common_utils/functional.hpp b/projects/miopen/common_utils/include/common_utils/functional.hpp
new file mode 100644
index 000000000000..19dde2bd28dc
--- /dev/null
+++ b/projects/miopen/common_utils/include/common_utils/functional.hpp
@@ -0,0 +1,131 @@
+/*******************************************************************************
+ *
+ * MIT License
+ *
+ * Copyright (c) 2017 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ *******************************************************************************/
+#ifndef GUARD_MLOPEN_FUNCTIONAL_HPP
+#define GUARD_MLOPEN_FUNCTIONAL_HPP
+
+#include <type_traits>
+#include <utility>
+
+#include <common_utils/each_args.hpp>
+// NOTE(review): MIOPEN_RETURNS is used below; presumably provided by a
+// common_utils returns header — confirm the header name against the project.
+#include <common_utils/returns.hpp>
+
+namespace miopen {
+namespace detail {
+
+// Invoke f with a pack of compile-time index constants 0..N-1.
+template <class F, std::size_t... Ns>
+auto each_i_impl(F f, std::index_sequence<Ns...>)
+    MIOPEN_RETURNS(f(std::integral_constant<std::size_t, Ns>{}...));
+} // namespace detail
+
+// by(f, p)(xs...) == f(p(xs)...)
+template <class F, class P>
+struct by_t
+{
+    F f;
+    P p;
+    template <class... Ts>
+    auto operator()(Ts&&... xs) const MIOPEN_RETURNS(f(p(std::forward<Ts>(xs))...))
+};
+
+template <class F, class P>
+by_t<F, P> by(F f, P p)
+{
+    return {std::move(f), std::move(p)};
+}
+
+// compose(f, g)(xs...) == f(g(xs...))
+template <class F, class G>
+struct compose_t
+{
+    F f;
+    G g;
+    template <class... Ts>
+    auto operator()(Ts&&... xs) const MIOPEN_RETURNS(f(g(std::forward<Ts>(xs)...)))
+};
+
+template <class F, class G>
+compose_t<F, G> compose(F f, G g)
+{
+    return {std::move(f), std::move(g)};
+}
+
+// flip(f)(x, y) == f(y, x)
+template <class F>
+struct flip_t
+{
+    F f;
+    template <class T, class U>
+    auto operator()(T&& x, U&& y) const
+        MIOPEN_RETURNS(f(std::forward<U>(y), std::forward<T>(x)))
+};
+
+template <class F>
+flip_t<F> flip(F f)
+{
+    return {std::move(f)};
+}
+
+// sequence(f)(N_c) == f(0_c, 1_c, ..., (N-1)_c)
+template <class F>
+struct sequence_t
+{
+    F f;
+    template <class IntegralConstant>
+    auto operator()(IntegralConstant) const
+        MIOPEN_RETURNS(detail::each_i_impl(
+            f, std::make_index_sequence<IntegralConstant::value>()));
+};
+
+template <class F>
+sequence_t<F> sequence(F f)
+{
+    return {std::move(f)};
+}
+
+// Call f N times, passing the compile-time index each time.
+template <class F, std::size_t N>
+void repeat_n(F f, std::integral_constant<std::size_t, N>)
+{
+    auto fs = [&f](auto... is) { return each_args(f, is...); };
+    sequence(fs)(std::integral_constant<std::size_t, N>{});
+}
+
+template <class T>
+struct cast_to
+{
+    template <class X>
+    T operator()(X&& x) const
+    {
+        return static_cast<T>(std::forward<X>(x));
+    }
+};
+
+// unpacker(f)(tuple) == f(tuple elements...)
+template <class F>
+auto unpacker(F f)
+{
+    return [=](auto xs) { return miopen::unpack(f, xs); };
+}
+
+// prepender(f, xs...)(ys...) == f(xs..., ys...)
+template <class F, class... Xs>
+auto prepender(F f, Xs... xs)
+{
+    return [=](auto... ys) { return f(xs..., ys...); };
+}
+
+} // namespace miopen
+
+#endif
diff --git a/projects/miopen/common_utils/include/common_utils/par_for.hpp b/projects/miopen/common_utils/include/common_utils/par_for.hpp
new file mode 100644
index 000000000000..1272dcf6ac9b
--- /dev/null
+++ b/projects/miopen/common_utils/include/common_utils/par_for.hpp
@@ -0,0 +1,149 @@
+/*******************************************************************************
+ *
+ * MIT License
+ *
+ * Copyright (c) 2020 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ *******************************************************************************/
+
+#ifndef MIOPEN_GUARD_MLOPEN_PAR_FOR_HPP
+#define MIOPEN_GUARD_MLOPEN_PAR_FOR_HPP
+
+#include <algorithm>
+#include <cassert>
+#include <cmath>
+#include <cstddef>
+#include <functional>
+#include <thread>
+
+#include <vector>
+
+namespace miopen {
+
+// RAII thread: joins on destruction so it can live safely in containers.
+struct joinable_thread : std::thread
+{
+    template <class... Xs>
+    joinable_thread(Xs&&... xs) : std::thread(std::forward<Xs>(xs)...) // NOLINT
+    {
+    }
+
+    joinable_thread& operator=(joinable_thread&& other) = default;
+    joinable_thread(joinable_thread&& other)            = default;
+
+    ~joinable_thread()
+    {
+        if(this->joinable())
+            this->join();
+    }
+};
+
+// Spawns a thread handling [work, min(n, work+grainsize)) and advances work.
+struct thread_factory
+{
+    template <class F>
+    joinable_thread operator()(std::size_t& work, std::size_t n, std::size_t grainsize, F f) const
+    {
+        auto result = joinable_thread([=] {
+            std::size_t start = work;
+            std::size_t last  = std::min(n, work + grainsize);
+            for(std::size_t i = start; i < last; i++)
+            {
+                f(i);
+            }
+        });
+        work += grainsize;
+        return result;
+    }
+};
+
+template <class F>
+void par_for_impl(std::size_t n, std::size_t threadsize, F f)
+{
+    if(threadsize <= 1)
+    {
+        for(std::size_t i = 0; i < n; i++)
+            f(i);
+    }
+    else
+    {
+        std::vector<joinable_thread> threads(threadsize);
+        const std::size_t grainsize = std::ceil(static_cast<double>(n) / threads.size());
+
+        std::size_t work = 0;
+        std::generate(threads.begin(),
+                      threads.end(),
+                      std::bind(thread_factory{}, std::ref(work), n, grainsize, f));
+        assert(work >= n);
+    }
+}
+
+// Explicit std::size_t instantiation: hardware_concurrency() is unsigned int.
+template <class F>
+void par_for(std::size_t n, std::size_t min_grain, F f)
+{
+    const auto threadsize =
+        std::min<std::size_t>(std::thread::hardware_concurrency(), n / min_grain);
+    par_for_impl(n, threadsize, f);
+}
+
+struct min_grain
+{
+    std::size_t n = 0;
+};
+
+template <class F>
+void par_for(std::size_t n, min_grain mg, F f)
+{
+    const auto threadsize =
+        std::min<std::size_t>(std::thread::hardware_concurrency(), n / mg.n);
+    par_for_impl(n, threadsize, f);
+}
+
+template <class F>
+void par_for(std::size_t n, F f)
+{
+    par_for(n, min_grain{8}, f);
+}
+
+struct max_threads
+{
+    std::size_t n = 0;
+};
+
+template <class F>
+void par_for(std::size_t n, max_threads mt, F f)
+{
+    const auto threadsize = std::min<std::size_t>(std::thread::hardware_concurrency(), mt.n);
+    par_for_impl(n, std::min(threadsize, n), f);
+}
+
+// Strided partition: thread t handles indices t, t+T, t+2T, ...
+template <class F>
+void par_for_strided(std::size_t n, max_threads mt, F f)
+{
+    auto threadsize = std::min<std::size_t>(std::thread::hardware_concurrency(), mt.n);
+    par_for_impl(threadsize, threadsize, [&](auto start) {
+        for(std::size_t i = start; i < n; i += threadsize)
+        {
+            f(i);
+        }
+    });
+}
+
+} // namespace miopen
+
+#endif
diff --git a/projects/miopen/common_utils/include/common_utils/random.hpp b/projects/miopen/common_utils/include/common_utils/random.hpp
new file mode 100644
index 000000000000..f6f8d85c4ce4
--- /dev/null
+++ b/projects/miopen/common_utils/include/common_utils/random.hpp
@@ -0,0 +1,159 @@
+/*******************************************************************************
+ *
+ * MIT License
+ *
+ * Copyright (c) 2025 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ *******************************************************************************/
+#ifndef GUARD_RANDOM_GEN_
+#define GUARD_RANDOM_GEN_
+
+// NOTE(review): MIOPEN_DECLARE_ENV_VAR_UINT64 / miopen::env presumably come
+// from the MIOpen env header — confirm the include path against the project.
+#include <miopen/env.hpp>
+
+#include <cassert>
+#include <cstdint>
+#include <cstring>
+#include <iostream>
+#include <limits>
+#include <random>
+#include <type_traits>
+
+MIOPEN_DECLARE_ENV_VAR_UINT64(MIOPEN_DEBUG_DRIVER_PRNG_SEED, 12345678)
+
+namespace env = miopen::env;
+
+namespace prng {
+namespace details {
+// Parameters of the classic glibc rand() linear congruential generator.
+using glibc_gen = std::linear_congruential_engine<std::uint32_t, 1103515245, 12345, 2147483648>;
+
+inline std::random_device::result_type get_default_seed()
+{
+    static std::random_device::result_type seed{[] {
+        auto external_seed = env::value(MIOPEN_DEBUG_DRIVER_PRNG_SEED);
+
+        auto seed_ = external_seed == 0
+                         ? std::random_device{}()
+                         : static_cast<std::random_device::result_type>(external_seed);
+        std::cout << "PRNG seed: " << seed_ << "\n";
+        return seed_;
+    }()};
+    return seed;
+}
+
+inline glibc_gen& get_prng()
+{
+    static thread_local glibc_gen gen{get_default_seed()};
+    return gen;
+}
+
+// Detects types that expose std::numeric_limits<T>::digits.
+template <typename T, typename = void>
+struct has_digits : std::false_type
+{
+};
+
+template <typename T>
+struct has_digits<T, std::void_t<decltype(std::numeric_limits<T>::digits)>> : std::true_type
+{
+};
+
+} // namespace details
+
+inline void reset_seed(std::random_device::result_type seed = 0)
+{
+    details::get_prng().seed(seed + details::get_default_seed());
+}
+
+// similar to std::generate_canonical, but simpler and faster
+template <typename T>
+inline T gen_canonical()
+{
+    if constexpr(std::is_floating_point_v<T>) // native fp
+    {
+        static constexpr T range =
+            static_cast<T>(1) /
+            static_cast<T>(details::glibc_gen::max() - details::glibc_gen::min() + 1);
+        return range * static_cast<T>(details::get_prng()() - details::glibc_gen::min());
+    }
+    else if constexpr(std::is_integral_v<T>)
+    {
+        auto val = details::get_prng()();
+        return static_cast<T>(((val >> 4) + (val >> 16)) & 0x1);
+    }
+    else
+    {
+        return static_cast<T>(gen_canonical<float>());
+    }
+}
+
+template <typename T>
+inline T gen_0_to_B(T B)
+{
+    if constexpr(std::is_floating_point_v<T>) // native fp
+    {
+        return gen_canonical<T>() * B;
+    }
+    else if constexpr(std::is_integral_v<T>)
+    {
+        // can only generate 27bit range, so it may not be suitable
+        // for huge 64 bit ranges, but we do not expect such ranges
+        return static_cast<T>((details::get_prng()() >> 4) % B);
+    }
+    else // half/bfloat/etc
+    {
+        return static_cast<T>(gen_0_to_B(static_cast<float>(B)));
+    }
+}
+
+template <typename T>
+inline T gen_A_to_B(T A, T B)
+{
+    assert(B > A);
+    return gen_0_to_B(B - A) + A;
+}
+
+template <typename T>
+inline T gen_off_range(T offset, T range)
+{
+    static_assert(std::is_integral_v<T>);
+    return prng::gen_0_to_B(range) + offset;
+}
+
+// Random subnormal value for FP types; returns 0 for unsupported types.
+template <typename T, bool Signed = true>
+inline T gen_subnorm()
+{
+    T denorm_val = static_cast<T>(0);
+    if constexpr(!std::is_integral_v<T> && !std::is_same_v<T, bool> &&
+                 std::is_trivially_copyable<T>::value && details::has_digits<T>::value)
+    {
+        using BitType = std::conditional_t<sizeof(T) == 2, uint16_t, uint32_t>;
+        static_assert(sizeof(T) == sizeof(BitType));
+
+        // -1 because ::digits counts the first implicit digit
+        static constexpr auto mantissa_bits = std::numeric_limits<T>::digits - 1;
+
+        BitType denorm_bits = static_cast<BitType>(gen_0_to_B(1 << mantissa_bits));
+        denorm_bits |= Signed ? (gen_canonical<BitType>() << (sizeof(T) * 8 - 1)) : 0;
+
+        // the proper way to do a type punning
+        std::memcpy(&denorm_val, &denorm_bits, sizeof(T));
+    }
+    return denorm_val;
+}
+} // namespace prng
+#endif // GUARD_RANDOM_GEN_
diff --git a/projects/miopen/common_utils/include/common_utils/rank.hpp b/projects/miopen/common_utils/include/common_utils/rank.hpp
new file mode 100644
index 000000000000..013ec6e7f7f4
--- /dev/null
+++ b/projects/miopen/common_utils/include/common_utils/rank.hpp
@@ -0,0 +1,42 @@
+/*******************************************************************************
+ *
+ * MIT License
+ *
+ * Copyright (c) 2017 Advanced Micro Devices, Inc.
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + *******************************************************************************/ +#ifndef GUARD_MIOPEN_RANK_HPP +#define GUARD_MIOPEN_RANK_HPP + +namespace miopen { + +template +struct rank : rank +{ +}; + +template <> +struct rank<0> +{ +}; +} // namespace miopen + +#endif diff --git a/projects/miopen/common_utils/include/common_utils/reduce_common.hpp b/projects/miopen/common_utils/include/common_utils/reduce_common.hpp new file mode 100644 index 000000000000..74ce541f694b --- /dev/null +++ b/projects/miopen/common_utils/include/common_utils/reduce_common.hpp @@ -0,0 +1,66 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2020 Advanced Micro Devices, Inc. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + *******************************************************************************/ +#ifndef GUARD_MIOPEN_REDUCE_COMMON_HPP +#define GUARD_MIOPEN_REDUCE_COMMON_HPP + +#include +#include + +namespace reduce { + +template +static inline Tdst convert_type(Tsrc x) +{ + return static_cast(x); +} + +template <> +inline float convert_type(half_float::half x) +{ + return half_float::half_cast(x); +}; + +template <> +inline half_float::half convert_type(float x) +{ + return half_float::half_cast(x); +}; + +template <> +inline float convert_type(bfloat16 x) +{ + return float(x); +}; + +template <> +inline bfloat16 convert_type(float x) +{ + return bfloat16(x); +}; + +}; // end of namespace reduce + +#endif diff --git a/projects/miopen/common_utils/include/common_utils/returns.hpp b/projects/miopen/common_utils/include/common_utils/returns.hpp new file mode 100644 index 000000000000..4fdb1db18b87 --- /dev/null +++ b/projects/miopen/common_utils/include/common_utils/returns.hpp @@ -0,0 +1,38 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2017 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + *******************************************************************************/ + +#ifndef GUARD_MIOPEN_RETURNS_HPP +#define GUARD_MIOPEN_RETURNS_HPP + +#define MIOPEN_RETURNS(...) \ + ->decltype(__VA_ARGS__) { return __VA_ARGS__; } + +#define MIOPEN_BODY_RETURNS(...) \ + { \ + return __VA_ARGS__; \ + } + +#endif diff --git a/projects/miopen/common_utils/include/common_utils/stringutils.hpp b/projects/miopen/common_utils/include/common_utils/stringutils.hpp new file mode 100644 index 000000000000..19d579014c73 --- /dev/null +++ b/projects/miopen/common_utils/include/common_utils/stringutils.hpp @@ -0,0 +1,165 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2017 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + *******************************************************************************/ +#ifndef GUARD_MIOPEN_STRINGUTILS_HPP +#define GUARD_MIOPEN_STRINGUTILS_HPP + +#include +#include +#include +#include +#include +#include +#include +#include + +#define MIOPEN_STRINGIZE_1(...) #__VA_ARGS__ +#define MIOPEN_STRINGIZE(...) MIOPEN_STRINGIZE_1(__VA_ARGS__) + +namespace miopen { + +inline std::string +ReplaceString(const std::string& in, const std::string& search, const std::string& replace) +{ + size_t pos = 0; + std::string subject(in); + while((pos = subject.find(search, pos)) != std::string::npos) + { + subject.replace(pos, search.length(), replace); + pos += replace.length(); + } + return subject; +} + +inline bool EndsWith(const std::string& value, const std::string& suffix) +{ + if(suffix.size() > value.size()) + return false; + else + return std::equal(suffix.rbegin(), suffix.rend(), value.rbegin()); +} + +template +inline std::string JoinStrings(Strings strings, std::string delim) +{ + auto it = strings.begin(); + if(it == strings.end()) + return ""; + + auto nit = std::next(it); + return std::accumulate( + nit, strings.end(), *it, [&](std::string x, std::string y) { return x + delim + y; }); +} + +template +static inline std::string TransformString(std::string s, F f) +{ + std::transform(s.begin(), s.end(), s.begin(), f); + return s; +} + +inline std::string ToUpper(std::string s) { return TransformString(std::move(s), ::toupper); } + +inline bool StartsWith(const std::string& value, const std::string& prefix) +{ + if(prefix.size() > value.size()) + return false; + else + return std::equal(prefix.begin(), prefix.end(), value.begin()); +} + +inline std::string RemovePrefix(std::string s, 
std::string prefix) +{ + if(StartsWith(s, prefix)) + return s.substr(prefix.length()); + else + return s; +} + +inline std::vector SplitSpaceSeparated(const std::string& in) +{ + std::istringstream ss(in); + const std::istream_iterator begin(ss), end; + return {begin, end}; +} + +inline std::vector SplitSpaceSeparated(const std::vector& in) +{ + std::vector rv; + for(const auto& item : in) + { + if(item.find(' ') != std::string::npos) + { + const auto splitted = SplitSpaceSeparated(item); + std::copy(splitted.begin(), splitted.end(), std::back_inserter(rv)); + } + else + { + rv.emplace_back(item); + } + } + return rv; +} + +inline std::vector SplitSpaceSeparated(const std::string& in, + const std::vector& dontSplitAfter) +{ + std::vector rv; + std::istringstream ss(in); + std::string s; + while(ss >> s) + { + if(any_of(dontSplitAfter, [&](const auto& dont) { return dont == s; })) + { + std::string s2; + if(ss >> s2) + { + s += std::string(" ").append(s2); // Exactly one space is important. + rv.push_back(s); + continue; + } + throw std::runtime_error("Error parsing string: '" + in + '\''); + } + rv.push_back(s); + } + return rv; +} + +inline std::vector SplitDelim(const std::string& in, const char delim) +{ + std::vector rv; + std::string token; + std::istringstream ss(in); + + while(std::getline(ss, token, delim)) + { + rv.push_back(token); + } + return rv; +} + +} // namespace miopen + +#endif // GUARD_MIOPEN_STRINGUTILS_HPP diff --git a/projects/miopen/common_utils/include/common_utils/type_name.hpp b/projects/miopen/common_utils/include/common_utils/type_name.hpp new file mode 100644 index 000000000000..ac7fd2ff6017 --- /dev/null +++ b/projects/miopen/common_utils/include/common_utils/type_name.hpp @@ -0,0 +1,139 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2017-2024 Advanced Micro Devices, Inc. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + *******************************************************************************/ + +#ifndef GUARD_TYPE_NAME_HPP +#define GUARD_TYPE_NAME_HPP + +#include +#include +#if defined(_MSC_VER) && !defined(__clang__) && !defined(__GNUC__) +#include +#endif + +namespace miopen { + +template +constexpr std::string_view type_name() +{ +#if defined(__clang__) || defined(__GNUC__) + // clang or gcc + constexpr auto full_name = std::string_view{__PRETTY_FUNCTION__}; +#elif defined(_MSC_VER) + // msvc + constexpr auto full_name = std::string_view{__FUNCSIG__}; +#endif + + // The substring with the data type name is located within the original string, between the + // prefix and the suffix, with the prefix always not at the beginning of the string and the + // suffix always at the end of the string. 
+#if defined(__clang__) + // clang + constexpr auto prefix = std::string_view{"[T = "}; + constexpr auto suffix = std::string_view{"]"}; +#elif defined(__GNUC__) + // gcc + constexpr auto prefix = std::string_view{"[with T = "}; + constexpr auto suffix = std::string_view{"; std::string_view = std::basic_string_view]"}; +#elif defined(_MSC_VER) + // msvc + constexpr auto prefix = std::string_view{"type_name<"}; + constexpr auto suffix = std::string_view{">(void)"}; +#endif + + constexpr auto prefix_pos = full_name.find(prefix); + static_assert(prefix_pos != std::string_view::npos); + + constexpr auto suffix_pos = full_name.rfind(suffix); + static_assert(suffix_pos != std::string_view::npos); + static_assert(suffix_pos == full_name.size() - suffix.size()); + + constexpr auto pos = prefix_pos + prefix.size(); + static_assert(pos < suffix_pos); + constexpr auto count = suffix_pos - pos; + + constexpr auto name = full_name.substr(pos, count); + +#if defined(__clang__) || defined(__GNUC__) + // clang or gcc + return name; +#elif defined(_MSC_VER) + // msvc + if constexpr(std::is_compound_v) + { + // For compound data types, the string contains the keyword 'class/struct/union/enum' before + // the data type name, separated by a space. 
+ constexpr auto sep = std::string_view{" "}; + constexpr auto sep_pos = name.find(sep); + static_assert(sep_pos != std::string_view::npos); + static_assert(sep_pos != 0); // must not be at the 0 position + + constexpr auto name_pos = sep_pos + sep.size(); + constexpr auto tname = name.substr(name_pos); + static_assert(tname.size() > 0); + + return tname; + } + else + { + return name; + } +#endif +} + +template +constexpr std::string_view type_name_bare() +{ + constexpr auto name = type_name(); + constexpr auto pos = name.rfind(':'); + if constexpr(pos == std::string_view::npos) + { + constexpr auto result = name; + return result; + } + else + { + constexpr auto bare_name = name.substr(pos + 1); + static_assert(bare_name.size() > 0); + return bare_name; + } +} + +template +const std::string& get_type_name() +{ + static const auto ret = std::string(type_name()); + return ret; +} + +template +const std::string& get_type_name(const T&) +{ + return miopen::get_type_name(); +} + +} // namespace miopen + +#endif diff --git a/projects/miopen/driver/CBAInferFusion_driver.hpp b/projects/miopen/driver/CBAInferFusion_driver.hpp index 0b63f8fe5af6..8bc25e1ffc58 100644 --- a/projects/miopen/driver/CBAInferFusion_driver.hpp +++ b/projects/miopen/driver/CBAInferFusion_driver.hpp @@ -36,9 +36,9 @@ #include "util_driver.hpp" #include "conv_common.hpp" -#include "../test/verify.hpp" -#include "../test/cpu_conv.hpp" -#include "../test/cpu_bias.hpp" +#include +#include +#include #include #include diff --git a/projects/miopen/driver/CMakeLists.txt b/projects/miopen/driver/CMakeLists.txt index 4aac2358c432..835d6437b650 100644 --- a/projects/miopen/driver/CMakeLists.txt +++ b/projects/miopen/driver/CMakeLists.txt @@ -74,7 +74,7 @@ endif() add_dependencies(MIOpenDriver generate_kernels) target_include_directories(MIOpenDriver PRIVATE ../src/kernels) # MIOpen_with_plugins ensures CK plugin .so's are built alongside MIOpenDriver -target_link_libraries(MIOpenDriver PRIVATE 
MIOpen_with_plugins Threads::Threads roc::rocrand nlohmann_json::nlohmann_json ) +target_link_libraries(MIOpenDriver PRIVATE MIOpen_with_plugins Threads::Threads roc::rocrand nlohmann_json::nlohmann_json miopen_common_utils miopen_utils) if(NOT MIOPEN_EMBED_DB STREQUAL "") target_link_libraries(MIOpenDriver PRIVATE $ ) endif() diff --git a/projects/miopen/driver/adam_driver.hpp b/projects/miopen/driver/adam_driver.hpp index f0c0258c8241..6c1984c44e87 100644 --- a/projects/miopen/driver/adam_driver.hpp +++ b/projects/miopen/driver/adam_driver.hpp @@ -32,7 +32,7 @@ #include "tensor_driver.hpp" #include "timer.hpp" -#include "../test/verify.hpp" +#include #include #include diff --git a/projects/miopen/driver/addlayernorm_driver.hpp b/projects/miopen/driver/addlayernorm_driver.hpp index effdc90c6127..a1bac6125dfc 100644 --- a/projects/miopen/driver/addlayernorm_driver.hpp +++ b/projects/miopen/driver/addlayernorm_driver.hpp @@ -26,8 +26,8 @@ #ifndef GUARD_MIOPEN_ADDLAYERNORM_DRIVER_HPP #define GUARD_MIOPEN_ADDLAYERNORM_DRIVER_HPP -#include <../test/tensor_holder.hpp> -#include <../test/verify.hpp> +#include +#include #include "InputFlags.hpp" #include "driver.hpp" #include "random.hpp" diff --git a/projects/miopen/driver/bn_driver.hpp b/projects/miopen/driver/bn_driver.hpp index 29cdfd970356..82802f8bd965 100644 --- a/projects/miopen/driver/bn_driver.hpp +++ b/projects/miopen/driver/bn_driver.hpp @@ -35,9 +35,9 @@ #include "util_driver.hpp" #include "rocrand_wrapper.hpp" -#include "../test/verify.hpp" -#include "../test/random.hpp" -#include "../test/fusionHost.hpp" +#include +#include +#include #include #include diff --git a/projects/miopen/driver/cat_driver.hpp b/projects/miopen/driver/cat_driver.hpp index f9a675440c15..a4e6804f9aad 100644 --- a/projects/miopen/driver/cat_driver.hpp +++ b/projects/miopen/driver/cat_driver.hpp @@ -18,8 +18,8 @@ #include #include #include -#include <../test/tensor_holder.hpp> -#include <../test/verify.hpp> +#include +#include #include 
#ifndef MLO_CATHOST_H_ diff --git a/projects/miopen/driver/conv_driver.hpp b/projects/miopen/driver/conv_driver.hpp index fcdbdbbd2ea6..77010d71e87a 100644 --- a/projects/miopen/driver/conv_driver.hpp +++ b/projects/miopen/driver/conv_driver.hpp @@ -28,10 +28,10 @@ #include #include -#include <../test/cpu_bias.hpp> -#include <../test/cpu_conv.hpp> -#include <../test/tensor_holder.hpp> -#include <../test/verify.hpp> +#include +#include +#include +#include #include #include diff --git a/projects/miopen/driver/conv_verify.hpp b/projects/miopen/driver/conv_verify.hpp index ae315843f01e..31d611bce134 100644 --- a/projects/miopen/driver/conv_verify.hpp +++ b/projects/miopen/driver/conv_verify.hpp @@ -27,7 +27,7 @@ #define GUARD_MIOPEN_CONV_VERIFY_HPP #include -#include "../test/gemm.hpp" +#include template diff --git a/projects/miopen/driver/ctc_driver.hpp b/projects/miopen/driver/ctc_driver.hpp index 2b8e64a8f79a..85aecb3264d3 100644 --- a/projects/miopen/driver/ctc_driver.hpp +++ b/projects/miopen/driver/ctc_driver.hpp @@ -35,7 +35,7 @@ #include -#include <../test/verify.hpp> +#include #include #include diff --git a/projects/miopen/driver/driver.hpp b/projects/miopen/driver/driver.hpp index 5bb698554566..2ebbcc2a4000 100644 --- a/projects/miopen/driver/driver.hpp +++ b/projects/miopen/driver/driver.hpp @@ -39,7 +39,7 @@ #include #include #include -#include <../test/tensor_holder.hpp> +#include #include "util_driver.hpp" #include "rocrand_wrapper.hpp" using half = half_float::half; diff --git a/projects/miopen/driver/dropout_driver.hpp b/projects/miopen/driver/dropout_driver.hpp index 84d942155a08..0016340fd60e 100644 --- a/projects/miopen/driver/dropout_driver.hpp +++ b/projects/miopen/driver/dropout_driver.hpp @@ -34,7 +34,7 @@ #include "util_driver.hpp" #include "util_file.hpp" -#include <../test/verify.hpp> +#include #include #include diff --git a/projects/miopen/driver/gemm_driver.hpp b/projects/miopen/driver/gemm_driver.hpp index d89a09a56644..8383b01ec22f 100644 
--- a/projects/miopen/driver/gemm_driver.hpp +++ b/projects/miopen/driver/gemm_driver.hpp @@ -34,7 +34,7 @@ #include "random.hpp" #include "util_driver.hpp" -#include <../test/verify.hpp> +#include #include #include diff --git a/projects/miopen/driver/getitem_driver.hpp b/projects/miopen/driver/getitem_driver.hpp index 52a5bc262f82..55b0dfcd296c 100644 --- a/projects/miopen/driver/getitem_driver.hpp +++ b/projects/miopen/driver/getitem_driver.hpp @@ -40,8 +40,8 @@ #include #include #include -#include <../test/tensor_holder.hpp> -#include <../test/verify.hpp> +#include +#include template int32_t mloGetitemBackwardRunHost(miopenTensorDescriptor_t dyDesc, diff --git a/projects/miopen/driver/glu_driver.hpp b/projects/miopen/driver/glu_driver.hpp index 38deb2d69e78..63bf7188db4d 100644 --- a/projects/miopen/driver/glu_driver.hpp +++ b/projects/miopen/driver/glu_driver.hpp @@ -38,7 +38,7 @@ #include #include -#include <../test/verify.hpp> +#include #include #include diff --git a/projects/miopen/driver/groupnorm_driver.hpp b/projects/miopen/driver/groupnorm_driver.hpp index 3773654c842d..97553dd3c13e 100644 --- a/projects/miopen/driver/groupnorm_driver.hpp +++ b/projects/miopen/driver/groupnorm_driver.hpp @@ -32,7 +32,7 @@ #include "mloGroupNormHost.hpp" #include "tensor_driver.hpp" #include "timer.hpp" -#include <../test/verify.hpp> +#include #include #include #include @@ -40,7 +40,7 @@ #include #include #include -#include <../test/tensor_holder.hpp> +#include #include "random.hpp" template diff --git a/projects/miopen/driver/gru_verify_gemm.hpp b/projects/miopen/driver/gru_verify_gemm.hpp index e07d6eab0bff..237d311b1c29 100644 --- a/projects/miopen/driver/gru_verify_gemm.hpp +++ b/projects/miopen/driver/gru_verify_gemm.hpp @@ -28,7 +28,7 @@ #include "dropout_gpu_emulator.hpp" -#include <../test/rnn_util.hpp> +#include #include #include diff --git a/projects/miopen/driver/kthvalue_driver.hpp b/projects/miopen/driver/kthvalue_driver.hpp index 75f7e5b535b2..8cbfa302bf14 
100644 --- a/projects/miopen/driver/kthvalue_driver.hpp +++ b/projects/miopen/driver/kthvalue_driver.hpp @@ -30,8 +30,8 @@ #include "timer.hpp" #include "random.hpp" -#include <../test/tensor_holder.hpp> -#include <../test/verify.hpp> +#include +#include #include #include diff --git a/projects/miopen/driver/layernorm_driver.hpp b/projects/miopen/driver/layernorm_driver.hpp index 6f6662f202f6..042e8a7164ea 100644 --- a/projects/miopen/driver/layernorm_driver.hpp +++ b/projects/miopen/driver/layernorm_driver.hpp @@ -26,9 +26,9 @@ #ifndef GUARD_MIOPEN_LAYERNORM_DRIVER_HPP #define GUARD_MIOPEN_LAYERNORM_DRIVER_HPP -#include <../test/tensor_holder.hpp> -#include <../test/verify.hpp> -#include <../test/cpu_layernorm.hpp> +#include +#include +#include #include "InputFlags.hpp" #include "driver.hpp" #include "miopen/miopen.h" diff --git a/projects/miopen/driver/lrn_driver.hpp b/projects/miopen/driver/lrn_driver.hpp index c1645621acd4..2f164aad38b1 100644 --- a/projects/miopen/driver/lrn_driver.hpp +++ b/projects/miopen/driver/lrn_driver.hpp @@ -12,7 +12,7 @@ #include "timer.hpp" #include "util_driver.hpp" -#include "../test/verify.hpp" +#include #include #include diff --git a/projects/miopen/driver/lstm_verify_gemm.hpp b/projects/miopen/driver/lstm_verify_gemm.hpp index fb98d5616ad5..a761779738f4 100644 --- a/projects/miopen/driver/lstm_verify_gemm.hpp +++ b/projects/miopen/driver/lstm_verify_gemm.hpp @@ -28,7 +28,7 @@ #include "dropout_gpu_emulator.hpp" -#include <../test/rnn_util.hpp> +#include #include #include diff --git a/projects/miopen/driver/miopen_Reduction.hpp b/projects/miopen/driver/miopen_Reduction.hpp index 3aee4e375c97..0fc05603bf2e 100644 --- a/projects/miopen/driver/miopen_Reduction.hpp +++ b/projects/miopen/driver/miopen_Reduction.hpp @@ -31,7 +31,7 @@ #include #include -#include "../test/cpu_reduce_util.hpp" +#include #include "tensor_driver.hpp" diff --git a/projects/miopen/driver/mloSoftmaxHost.hpp b/projects/miopen/driver/mloSoftmaxHost.hpp index 
fd0a1768e6a6..928eb6f63490 100644 --- a/projects/miopen/driver/mloSoftmaxHost.hpp +++ b/projects/miopen/driver/mloSoftmaxHost.hpp @@ -1,350 +1,2 @@ -// Copyright (c) Advanced Micro Devices, Inc., or its affiliates. -// SPDX-License-Identifier: MIT - -#ifndef MLO_SOFTMAXHOST_H_ -#define MLO_SOFTMAXHOST_H_ - -#include -#include - -//////////////////////////////////////////////////////////// -// -/////////////////////////////////////////////////////////// - -#define NEGATIVE_INF_FP32 (-1e20) -#define NEGATIVE_INF_FP16 (-1e5) - -template -T logaddexp(T x, T y, T neg_inf) -{ - T a = std::max(x, y); - T b = std::min(x, y); - T c = b - a; - - return c <= neg_inf ? std::max(a, neg_inf) : std::max(T(a + log(T(1) + exp(b - a))), neg_inf); -} - -template -int mloSoftmaxForwardRunHost(miopenTensorDescriptor_t inputTensor, - miopenTensorDescriptor_t outputTensor, - Tgpu* in, - Tcheck* outhost, - float alpha, - float beta, - miopenSoftmaxAlgorithm_t algo, - miopenSoftmaxMode_t mode) -{ - int n, c, h, w, in_nstr, in_cstr, in_hstr, in_wstr; - int out_nstr, out_cstr, out_hstr, out_wstr; - miopenGet4dTensorDescriptorLengths(inputTensor, &n, &c, &h, &w); - miopenGet4dTensorDescriptorStrides(inputTensor, &in_nstr, &in_cstr, &in_hstr, &in_wstr); - miopenGet4dTensorDescriptorStrides(outputTensor, &out_nstr, &out_cstr, &out_hstr, &out_wstr); - - Tcheck max_val = (sizeof(Tgpu) == 4) ? 3.402823466e+38f : 65504.; - std::vector channel_max((mode == MIOPEN_SOFTMAX_MODE_INSTANCE ? 
n : n * h * w), - static_cast(-max_val)); - std::vector results(n * c * h * w, static_cast(0.0)); - - int ret = 0; - - if(mode == MIOPEN_SOFTMAX_MODE_INSTANCE) - { - for(int i = 0; i < n; i++) - { - if(algo == MIOPEN_SOFTMAX_FAST) - { - for(int j = 0; j < c; j++) - for(int s0 = 0; s0 < h; s0++) - for(int s1 = 0; s1 < w; s1++) - { - results[(i * c + j) * h * w + s0 * w + s1] = static_cast( - in[i * in_nstr + j * in_cstr + s0 * in_hstr + s1 * in_wstr]); - } - } - else - { - for(int j = 0; j < c; j++) - for(int s0 = 0; s0 < h; s0++) - for(int s1 = 0; s1 < w; s1++) - { - channel_max[i] = std::max( - static_cast( - in[i * in_nstr + j * in_cstr + s0 * in_hstr + s1 * in_wstr]), - channel_max[i]); - } - - for(int j = 0; j < c; j++) - for(int s0 = 0; s0 < h; s0++) - for(int s1 = 0; s1 < w; s1++) - { - results[(i * c + j) * h * w + s0 * w + s1] = - static_cast( - in[i * in_nstr + j * in_cstr + s0 * in_hstr + s1 * in_wstr]) - - channel_max[i]; - } - } - - if(algo == MIOPEN_SOFTMAX_LOG) - { - Tcheck neg_inf = static_cast( - miopen::deref(inputTensor).GetType() == miopenHalf ? 
NEGATIVE_INF_FP16 - : NEGATIVE_INF_FP32); - channel_max[i] = neg_inf; - for(int j = 0; j < c; j++) - for(int s0 = 0; s0 < h; s0++) - for(int s1 = 0; s1 < w; s1++) - { - channel_max[i] = logaddexp(results[(i * c + j) * h * w + s0 * w + s1], - channel_max[i], - neg_inf); - } - - for(int j = 0; j < c; j++) - for(int s0 = 0; s0 < h; s0++) - for(int s1 = 0; s1 < w; s1++) - { - outhost[i * out_nstr + j * out_cstr + s0 * out_hstr + s1 * out_wstr] = - alpha * - (results[(i * c + j) * h * w + s0 * w + s1] - channel_max[i]) + - beta * outhost[i * out_nstr + j * out_cstr + s0 * out_hstr + - s1 * out_wstr]; - } - } - else - { - channel_max[i] = 0.0; - for(int j = 0; j < c; j++) - for(int s0 = 0; s0 < h; s0++) - for(int s1 = 0; s1 < w; s1++) - { - results[(i * c + j) * h * w + s0 * w + s1] = - exp(results[(i * c + j) * h * w + s0 * w + s1]); - channel_max[i] += results[(i * c + j) * h * w + s0 * w + s1]; - } - - for(int j = 0; j < c; j++) - for(int s0 = 0; s0 < h; s0++) - for(int s1 = 0; s1 < w; s1++) - { - outhost[i * out_nstr + j * out_cstr + s0 * out_hstr + s1 * out_wstr] = - alpha * - (results[(i * c + j) * h * w + s0 * w + s1] / channel_max[i]) + - beta * outhost[i * out_nstr + j * out_cstr + s0 * out_hstr + - s1 * out_wstr]; - } - } - } - } - else - { - for(int i = 0; i < n; i++) - { - for(int s0 = 0; s0 < h; s0++) - for(int s1 = 0; s1 < w; s1++) - { - if(algo == MIOPEN_SOFTMAX_FAST) - { - for(int j = 0; j < c; j++) - { - results[(i * c + j) * h * w + s0 * w + s1] = static_cast( - in[i * in_nstr + j * in_cstr + s0 * in_hstr + s1 * in_wstr]); - } - } - else - { - for(int j = 0; j < c; j++) - { - channel_max[i * h * w + s0 * w + s1] = std::max( - static_cast( - in[i * in_nstr + j * in_cstr + s0 * in_hstr + s1 * in_wstr]), - channel_max[i * h * w + s0 * w + s1]); - } - - for(int j = 0; j < c; j++) - { - results[(i * c + j) * h * w + s0 * w + s1] = - static_cast( - in[i * in_nstr + j * in_cstr + s0 * in_hstr + s1 * in_wstr]) - - channel_max[i * h * w + s0 * w + s1]; - } - } - 
- if(algo == MIOPEN_SOFTMAX_LOG) - { - Tcheck neg_inf = static_cast( - miopen::deref(inputTensor).GetType() == miopenHalf ? NEGATIVE_INF_FP16 - : NEGATIVE_INF_FP32); - channel_max[i * h * w + s0 * w + s1] = results[i * c * h * w + s0 * w + s1]; - for(int j = 1; j < c; j++) - { - channel_max[i * h * w + s0 * w + s1] = - logaddexp(results[(i * c + j) * h * w + s0 * w + s1], - channel_max[i * h * w + s0 * w + s1], - neg_inf); - } - - for(int j = 0; j < c; j++) - { - outhost[i * out_nstr + j * out_cstr + s0 * out_hstr + s1 * out_wstr] = - alpha * (results[(i * c + j) * h * w + s0 * w + s1] - - channel_max[i * h * w + s0 * w + s1]) + - beta * outhost[i * out_nstr + j * out_cstr + s0 * out_hstr + - s1 * out_wstr]; - } - } - else - { - channel_max[i * h * w + s0 * w + s1] = 0.0; - for(int j = 0; j < c; j++) - { - results[(i * c + j) * h * w + s0 * w + s1] = - exp(results[(i * c + j) * h * w + s0 * w + s1]); - channel_max[i * h * w + s0 * w + s1] += - results[(i * c + j) * h * w + s0 * w + s1]; - } - - for(int j = 0; j < c; j++) - { - outhost[i * out_nstr + j * out_cstr + s0 * out_hstr + s1 * out_wstr] = - alpha * (results[(i * c + j) * h * w + s0 * w + s1] / - channel_max[i * h * w + s0 * w + s1]) + - beta * outhost[i * out_nstr + j * out_cstr + s0 * out_hstr + - s1 * out_wstr]; - } - } - } - } - } - - return ret; -} - -template -int mloSoftmaxBackwardRunHost(miopenTensorDescriptor_t dInputTensor, - miopenTensorDescriptor_t dOutputTensor, - Tgpu* out, - Tgpu* dout, - Tcheck* dinhost, - float alpha, - float beta, - miopenSoftmaxAlgorithm_t algo, - miopenSoftmaxMode_t mode) -{ - int n, c, h, w, in_nstr, in_cstr, in_hstr, in_wstr; - int out_nstr, out_cstr, out_hstr, out_wstr; - miopenGet4dTensorDescriptorLengths(dOutputTensor, &n, &c, &h, &w); - miopenGet4dTensorDescriptorStrides(dInputTensor, &in_nstr, &in_cstr, &in_hstr, &in_wstr); - miopenGet4dTensorDescriptorStrides(dOutputTensor, &out_nstr, &out_cstr, &out_hstr, &out_wstr); - - std::vector channel_dot((mode == 
MIOPEN_SOFTMAX_MODE_INSTANCE ? n : n * h * w), - static_cast(0.0)); - std::vector results(n * c * h * w, static_cast(0.0)); - - int ret = 0; - - for(int i = 0; i < n; i++) - { - if(mode == MIOPEN_SOFTMAX_MODE_INSTANCE) - { - for(int j = 0; j < c; j++) - for(int s0 = 0; s0 < h; s0++) - for(int s1 = 0; s1 < w; s1++) - { - if(algo == MIOPEN_SOFTMAX_LOG) - { - channel_dot[i] += static_cast( - dout[i * out_nstr + j * out_cstr + s0 * out_hstr + s1 * out_wstr]); - } - else - { - channel_dot[i] += - static_cast(out[i * out_nstr + j * out_cstr + - s0 * out_hstr + s1 * out_wstr]) * - static_cast(dout[i * out_nstr + j * out_cstr + - s0 * out_hstr + s1 * out_wstr]); - } - } - - for(int j = 0; j < c; j++) - for(int s0 = 0; s0 < h; s0++) - for(int s1 = 0; s1 < w; s1++) - { - if(algo == MIOPEN_SOFTMAX_LOG) - { - results[(i * c + j) * h * w + s0 * w + s1] = - static_cast(dout[i * out_nstr + j * out_cstr + - s0 * out_hstr + s1 * out_wstr]) - - channel_dot[i] * std::exp(out[i * out_nstr + j * out_cstr + - s0 * out_hstr + s1 * out_wstr]); - } - else - { - results[(i * c + j) * h * w + s0 * w + s1] = - static_cast(dout[i * out_nstr + j * out_cstr + - s0 * out_hstr + s1 * out_wstr]) - - channel_dot[i]; - - results[(i * c + j) * h * w + s0 * w + s1] *= static_cast( - out[i * out_nstr + j * out_cstr + s0 * out_hstr + s1 * out_wstr]); - } - dinhost[i * in_nstr + j * in_cstr + s0 * in_hstr + s1 * in_wstr] = - alpha * results[(i * c + j) * h * w + s0 * w + s1] + - beta * dinhost[i * in_nstr + j * in_cstr + s0 * in_hstr + s1 * in_wstr]; - } - } - else - { - for(int s0 = 0; s0 < h; s0++) - for(int s1 = 0; s1 < w; s1++) - { - for(int j = 0; j < c; j++) - { - if(algo == MIOPEN_SOFTMAX_LOG) - { - channel_dot[i * h * w + s0 * w + s1] += static_cast( - dout[i * out_nstr + j * out_cstr + s0 * out_hstr + s1 * out_wstr]); - } - else - { - channel_dot[i * h * w + s0 * w + s1] += - static_cast(out[i * out_nstr + j * out_cstr + - s0 * out_hstr + s1 * out_wstr]) * - static_cast(dout[i * out_nstr + j * 
out_cstr + - s0 * out_hstr + s1 * out_wstr]); - } - } - - for(int j = 0; j < c; j++) - { - if(algo == MIOPEN_SOFTMAX_LOG) - { - results[(i * c + j) * h * w + s0 * w + s1] = - static_cast(dout[i * out_nstr + j * out_cstr + - s0 * out_hstr + s1 * out_wstr]) - - channel_dot[i * h * w + s0 * w + s1] * - std::exp(out[i * out_nstr + j * out_cstr + s0 * out_hstr + - s1 * out_wstr]); - } - else - { - results[(i * c + j) * h * w + s0 * w + s1] = - static_cast(dout[i * out_nstr + j * out_cstr + - s0 * out_hstr + s1 * out_wstr]) - - channel_dot[i * h * w + s0 * w + s1]; - - results[(i * c + j) * h * w + s0 * w + s1] *= static_cast( - out[i * out_nstr + j * out_cstr + s0 * out_hstr + s1 * out_wstr]); - } - dinhost[i * in_nstr + j * in_cstr + s0 * in_hstr + s1 * in_wstr] = - alpha * results[(i * c + j) * h * w + s0 * w + s1] + - beta * dinhost[i * in_nstr + j * in_cstr + s0 * in_hstr + s1 * in_wstr]; - } - } - } - } - - return ret; -} - -#endif +// Forwarding header — implementation moved to miopen_utils. 
+#include diff --git a/projects/miopen/driver/multimarginloss_driver.hpp b/projects/miopen/driver/multimarginloss_driver.hpp index dab040ef3ef3..5d2a60db4507 100644 --- a/projects/miopen/driver/multimarginloss_driver.hpp +++ b/projects/miopen/driver/multimarginloss_driver.hpp @@ -36,8 +36,8 @@ #include #include #include -#include <../test/tensor_holder.hpp> -#include <../test/verify.hpp> +#include +#include #include template diff --git a/projects/miopen/driver/prelu_driver.hpp b/projects/miopen/driver/prelu_driver.hpp index 761f97cc64eb..cab2eb811885 100644 --- a/projects/miopen/driver/prelu_driver.hpp +++ b/projects/miopen/driver/prelu_driver.hpp @@ -31,7 +31,7 @@ #include "tensor_driver.hpp" #include "timer.hpp" -#include <../test/verify.hpp> +#include #include diff --git a/projects/miopen/driver/random.hpp b/projects/miopen/driver/random.hpp index f6f8d85c4ce4..30be9387d99c 100644 --- a/projects/miopen/driver/random.hpp +++ b/projects/miopen/driver/random.hpp @@ -1,159 +1,2 @@ -/******************************************************************************* - * - * MIT License - * - * Copyright (c) 2025 Advanced Micro Devices, Inc. - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - * - *******************************************************************************/ -#ifndef GUARD_RANDOM_GEN_ -#define GUARD_RANDOM_GEN_ - -#include - -#include -#include -#include - -MIOPEN_DECLARE_ENV_VAR_UINT64(MIOPEN_DEBUG_DRIVER_PRNG_SEED, 12345678) - -namespace env = miopen::env; - -namespace prng { -namespace details { -using glibc_gen = std::linear_congruential_engine; - -inline std::random_device::result_type get_default_seed() -{ - static std::random_device::result_type seed{[] { - auto external_seed = env::value(MIOPEN_DEBUG_DRIVER_PRNG_SEED); - - auto seed_ = external_seed == 0 - ? std::random_device{}() - : static_cast(external_seed); - std::cout << "PRNG seed: " << seed_ << "\n"; - return seed_; - }()}; - return seed; -} - -inline glibc_gen& get_prng() -{ - static thread_local glibc_gen gen{get_default_seed()}; - return gen; -} - -template -struct has_digits : std::false_type -{ -}; - -template -struct has_digits::digits)>> : std::true_type -{ -}; - -} // namespace details - -inline void reset_seed(std::random_device::result_type seed = 0) -{ - details::get_prng().seed(seed + details::get_default_seed()); -} - -// similar to std::generate_canonical, but simpler and faster -template -inline T gen_canonical() -{ - if constexpr(std::is_floating_point_v) // native fp - { - static constexpr T range = - static_cast(1) / - static_cast(details::glibc_gen::max() - details::glibc_gen::min() + 1); - return range * static_cast(details::get_prng()() - details::glibc_gen::min()); - } - else if constexpr(std::is_integral_v) - { - auto val = details::get_prng()(); - return static_cast(((val >> 4) + (val >> 16)) & 0x1); - } - else - { - return static_cast(gen_canonical()); - } -} - -template 
-inline T gen_0_to_B(T B) -{ - if constexpr(std::is_floating_point_v) // native fp - { - return gen_canonical() * B; - } - else if constexpr(std::is_integral_v) - { - // can only generate 27bit range, so it may not be suitable - // for huge 64 bit ranges, but we do not expect such ranges - return static_cast((details::get_prng()() >> 4) % B); - } - else // half/bfloat/etc - { - return static_cast(gen_0_to_B(static_cast(B))); - } -} - -template -inline T gen_A_to_B(T A, T B) -{ - assert(B > A); - return gen_0_to_B(B - A) + A; -} - -template -inline T gen_off_range(T offset, T range) -{ - static_assert(std::is_integral_v); - return prng::gen_0_to_B(range) + offset; -} - -template -inline T gen_subnorm() -{ - T denorm_val = static_cast(0); - if constexpr(!std::is_integral_v && !std::is_same_v && - std::is_trivially_copyable::value && details::has_digits::value) - { - using BitType = std::conditional_t>; - static_assert(sizeof(T) == sizeof(BitType)); - - // -1 because ::digits counts the first implicit digit - static constexpr auto mantissa_bits = std::numeric_limits::digits - 1; - - BitType denorm_bits = static_cast(gen_0_to_B(1 << mantissa_bits)); - denorm_bits |= Signed ? (gen_canonical() << (sizeof(T) * 8 - 1)) : 0; - - // the proper way to do a type punning - std::memcpy(&denorm_val, &denorm_bits, sizeof(T)); - } - return denorm_val; -} -} // namespace prng -#endif // GUARD_RANDOM_GEN_ +// Forwarding header — implementation moved to common_utils. 
+#include diff --git a/projects/miopen/driver/reduce_driver.hpp b/projects/miopen/driver/reduce_driver.hpp index ab1c50e806f1..6300fa32a690 100644 --- a/projects/miopen/driver/reduce_driver.hpp +++ b/projects/miopen/driver/reduce_driver.hpp @@ -35,7 +35,7 @@ #include "util_driver.hpp" #include "util_file.hpp" -#include "../test/verify.hpp" +#include #include #include diff --git a/projects/miopen/driver/reducecalculation_driver.hpp b/projects/miopen/driver/reducecalculation_driver.hpp index 200196950997..738fb6032f3c 100644 --- a/projects/miopen/driver/reducecalculation_driver.hpp +++ b/projects/miopen/driver/reducecalculation_driver.hpp @@ -40,8 +40,8 @@ #include #include #include -#include <../test/tensor_holder.hpp> -#include <../test/verify.hpp> +#include +#include #include "../src/kernels/MIOpenReduceCalculation.hpp" #ifndef MLO_REDUCE_CALCULATIONMHOST_H_ diff --git a/projects/miopen/driver/reduceextreme_driver.hpp b/projects/miopen/driver/reduceextreme_driver.hpp index a06f5288a164..b2caf5dda398 100644 --- a/projects/miopen/driver/reduceextreme_driver.hpp +++ b/projects/miopen/driver/reduceextreme_driver.hpp @@ -39,8 +39,8 @@ #include #include #include -#include <../test/tensor_holder.hpp> -#include <../test/verify.hpp> +#include +#include #include "../src/kernels/MIOpenReduceExtreme.hpp" template diff --git a/projects/miopen/driver/rnn_driver.hpp b/projects/miopen/driver/rnn_driver.hpp index 4cd47739f5ea..7f35be320155 100644 --- a/projects/miopen/driver/rnn_driver.hpp +++ b/projects/miopen/driver/rnn_driver.hpp @@ -36,7 +36,7 @@ #include "util_driver.hpp" #include "util_file.hpp" -#include <../test/verify.hpp> +#include #include #include diff --git a/projects/miopen/driver/rnn_seq_driver.hpp b/projects/miopen/driver/rnn_seq_driver.hpp index 1ac9b23c0b4c..7babcfd00273 100644 --- a/projects/miopen/driver/rnn_seq_driver.hpp +++ b/projects/miopen/driver/rnn_seq_driver.hpp @@ -36,7 +36,7 @@ #include "util_driver.hpp" #include "util_file.hpp" -#include 
<../test/verify.hpp> +#include #include #include diff --git a/projects/miopen/driver/rnn_verify_gemm.hpp b/projects/miopen/driver/rnn_verify_gemm.hpp index b1fa42c3503b..04b73111513d 100644 --- a/projects/miopen/driver/rnn_verify_gemm.hpp +++ b/projects/miopen/driver/rnn_verify_gemm.hpp @@ -28,7 +28,7 @@ #include "dropout_gpu_emulator.hpp" -#include <../test/rnn_util.hpp> +#include #include #include diff --git a/projects/miopen/driver/rope_driver.hpp b/projects/miopen/driver/rope_driver.hpp index bbad2370bf4e..27f0a03126ac 100644 --- a/projects/miopen/driver/rope_driver.hpp +++ b/projects/miopen/driver/rope_driver.hpp @@ -39,8 +39,8 @@ #include #include #include -#include <../test/tensor_holder.hpp> -#include <../test/verify.hpp> +#include +#include template int32_t mloRoPEForwardRunHost(miopenTensorDescriptor_t xDesc, diff --git a/projects/miopen/driver/softmarginloss_driver.hpp b/projects/miopen/driver/softmarginloss_driver.hpp index 3a6b095eaa0e..6589abd88db9 100644 --- a/projects/miopen/driver/softmarginloss_driver.hpp +++ b/projects/miopen/driver/softmarginloss_driver.hpp @@ -35,8 +35,8 @@ #include #include #include -#include <../test/tensor_holder.hpp> -#include <../test/verify.hpp> +#include +#include #include template diff --git a/projects/miopen/driver/softmax_driver.hpp b/projects/miopen/driver/softmax_driver.hpp index e147191b2deb..52f42fdfd5f8 100644 --- a/projects/miopen/driver/softmax_driver.hpp +++ b/projects/miopen/driver/softmax_driver.hpp @@ -11,7 +11,7 @@ #include "timer.hpp" #include "util_driver.hpp" -#include <../test/verify.hpp> +#include #include #include diff --git a/projects/miopen/driver/t5layernorm_driver.hpp b/projects/miopen/driver/t5layernorm_driver.hpp index c8517ad525d8..b57fe456403f 100644 --- a/projects/miopen/driver/t5layernorm_driver.hpp +++ b/projects/miopen/driver/t5layernorm_driver.hpp @@ -26,8 +26,8 @@ #ifndef GUARD_MIOPEN_T5LAYERNORM_DRIVER_HPP #define GUARD_MIOPEN_T5LAYERNORM_DRIVER_HPP -#include 
<../test/tensor_holder.hpp> -#include <../test/verify.hpp> +#include +#include #include "InputFlags.hpp" #include "driver.hpp" #include "random.hpp" diff --git a/projects/miopen/driver/transformers_adam_w_driver.hpp b/projects/miopen/driver/transformers_adam_w_driver.hpp index dfd82a3284c6..a1cd81f2eb53 100644 --- a/projects/miopen/driver/transformers_adam_w_driver.hpp +++ b/projects/miopen/driver/transformers_adam_w_driver.hpp @@ -32,7 +32,7 @@ #include "tensor_driver.hpp" #include "timer.hpp" -#include "../test/verify.hpp" +#include #include #include diff --git a/projects/miopen/miopen_utils/CMakeLists.txt b/projects/miopen/miopen_utils/CMakeLists.txt new file mode 100644 index 000000000000..e93a717d0a0e --- /dev/null +++ b/projects/miopen/miopen_utils/CMakeLists.txt @@ -0,0 +1,40 @@ +################################################################################ +# +# MIT License +# +# Copyright (c) 2025 Advanced Micro Devices, Inc. +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. +# +################################################################################ + +# INTERNAL BUILD-ONLY library — not installed, not exported, not part of the public MIOpen API. +# Shared verification/test utilities for MIOpenDriver and tests. +# Depends on common_utils and the MIOpen public API (miopen.h). +# Do NOT add install(TARGETS miopen_utils ...) — headers live in the build tree only. + +add_library(miopen_utils INTERFACE) +set_target_properties(miopen_utils PROPERTIES EXCLUDE_FROM_ALL TRUE) + +target_include_directories(miopen_utils INTERFACE + # BUILD_INTERFACE only — no install interface; these headers are not installed. + $ +) + +target_link_libraries(miopen_utils INTERFACE miopen_common_utils) diff --git a/projects/miopen/miopen_utils/include/miopen_utils/cpu_bias.hpp b/projects/miopen/miopen_utils/include/miopen_utils/cpu_bias.hpp new file mode 100644 index 000000000000..0125ca37d298 --- /dev/null +++ b/projects/miopen/miopen_utils/include/miopen_utils/cpu_bias.hpp @@ -0,0 +1,140 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2019 Advanced Micro Devices, Inc. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + *******************************************************************************/ +#ifndef GUARD_CPU_BIAS_HPP +#define GUARD_CPU_BIAS_HPP + +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +template +void cpu_bias_forward_impl(tensor& out, const tensor& bias) +{ + assert(out.desc.GetNumDims() == NSpatialDim + 2 and bias.desc.GetNumDims() == NSpatialDim + 2); + assert( + bias.desc.GetLengths()[0] == 1 && bias.desc.GetLengths()[1] == out.desc.GetLengths()[1] && + std::all_of(bias.desc.GetLengths().begin() + 2, bias.desc.GetLengths().end(), [](auto v) { + return v == 1; + })); + + out.par_for_each([&](auto out_n_id, auto out_k_id, auto... out_spatial_id_pack) { + out(out_n_id, out_k_id, out_spatial_id_pack...) 
= + double(out(out_n_id, out_k_id, out_spatial_id_pack...)) + double(bias.data[out_k_id]); + }); +} + +template +void cpu_bias_backward_data_impl(const tensor& out, tensor& bias) +{ + assert(out.desc.GetNumDims() == NSpatialDim + 2 and bias.desc.GetNumDims() == NSpatialDim + 2); + assert( + bias.desc.GetLengths()[0] == 1 && bias.desc.GetLengths()[1] == out.desc.GetLengths()[0] && + std::all_of(bias.desc.GetLengths().begin() + 2, bias.desc.GetLengths().end(), [](auto v) { + return v == 1; + })); + + std::size_t out_n_len = out.desc.GetLengths()[0]; + std::size_t out_k_len = out.desc.GetLengths()[1]; + + std::array out_spatial_len{}; + std::copy_n(out.desc.GetLengths().begin() + 2, NSpatialDim, out_spatial_len.begin()); + + miopen::par_ford(out_k_len)([&](auto out_k_id) { + auto ford_out_n_spatial = + miopen::unpacker(miopen::prepender(miopen::ford, out_n_len))(out_spatial_len); + + double acc = 0; + ford_out_n_spatial([&](auto out_n_id, auto... out_spatial_id_pack) { + acc += double(out(out_n_id, out_k_id, out_spatial_id_pack...)); + }); + + bias.data[out_k_id] = acc; + }); +} + +template +void cpu_bias_forward(tensor& out, const tensor& bias) +{ + switch(out.desc.GetNumDims()) + { + case 3: { + cpu_bias_forward_impl<1>(out, bias); + break; + } + case 4: { + cpu_bias_forward_impl<2>(out, bias); + break; + } + case 5: { + cpu_bias_forward_impl<3>(out, bias); + break; + } + case 6: { + cpu_bias_forward_impl<4>(out, bias); + break; + } + default: { + MIOPEN_THROW("not belong to any case"); + } + } +} + +template +void cpu_bias_backward_data(const tensor& out, tensor& bias) +{ + switch(out.desc.GetNumDims()) + { + case 3: { + cpu_bias_backward_data_impl<1>(out, bias); + break; + } + case 4: { + cpu_bias_backward_data_impl<2>(out, bias); + break; + } + case 5: { + cpu_bias_backward_data_impl<3>(out, bias); + break; + } + case 6: { + cpu_bias_backward_data_impl<4>(out, bias); + break; + } + default: { + MIOPEN_THROW("not belong to any case"); + } + } +} +#endif diff --git 
a/projects/miopen/miopen_utils/include/miopen_utils/cpu_conv.hpp b/projects/miopen/miopen_utils/include/miopen_utils/cpu_conv.hpp new file mode 100644 index 000000000000..2ef2c5b31236 --- /dev/null +++ b/projects/miopen/miopen_utils/include/miopen_utils/cpu_conv.hpp @@ -0,0 +1,514 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2019 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + *******************************************************************************/ +#ifndef GUARD_CPU_CONV_HPP +#define GUARD_CPU_CONV_HPP + +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +template +static constexpr auto make_array(T x, Ts... 
xs) +{ + return std::array{{x, xs...}}; +} + +template +struct PassThru +{ + T operator()(T t) { return t; } +}; + +template +struct cpu_convolution_acc_type +{ + using type = double; // default using double as accumulator +}; + +template <> +struct cpu_convolution_acc_type +{ + using type = int32_t; +}; + +template <> +struct cpu_convolution_acc_type +{ + using type = double; +}; + +template +void cpu_convolution_forward_impl(const tensor& in, + const tensor& wei, + tensor& out, + const Range& pads, + const Range& strides, + const Range& dilations, + std::size_t group_count, + FI fi = {}, + FW fw = {}) +{ + static_assert(ConvDim > 0, "wrong! convolution dim should be larger than 0"); + assert(in.desc.GetNumDims() == ConvDim + 2 and wei.desc.GetNumDims() == ConvDim + 2 and + out.desc.GetNumDims() == ConvDim + 2 and pads.size() == ConvDim and + strides.size() == ConvDim and dilations.size() == ConvDim); + std::size_t out_n_len = out.desc.GetLengths()[0]; + + std::size_t wei_k_len = wei.desc.GetLengths()[0]; + std::size_t wei_c_len = wei.desc.GetLengths()[1]; + + std::size_t vector_len = in.desc.GetVectorLength(); + + std::array in_spatial_len{}; + std::array wei_spatial_len{}; + std::array out_spatial_len{}; + + std::copy_n(in.desc.GetLengths().begin() + 2, ConvDim, in_spatial_len.begin()); + std::copy_n(wei.desc.GetLengths().begin() + 2, ConvDim, wei_spatial_len.begin()); + std::copy_n(out.desc.GetLengths().begin() + 2, ConvDim, out_spatial_len.begin()); + + if(wei.desc.GetLayout_str() == "CHWNc") + { + wei_c_len = wei.desc.GetLengths()[0]; + std::copy_n(wei.desc.GetLengths().begin() + 1, ConvDim, wei_spatial_len.begin()); + wei_k_len = wei.desc.GetLengths()[3]; + } + + std::size_t wei_k_len_per_group = wei_k_len / group_count; + + // f(x0, x1, xs...) + // f1(xs...) = f(x0, x1, xs...) + // f2(xs_array) = f1(xs...) 
+ auto par_ford_out_nk_spatial = miopen::unpacker( + miopen::prepender(miopen::par_ford, out_n_len, wei_k_len))(out_spatial_len); + + par_ford_out_nk_spatial([&](std::size_t out_n_id, + std::size_t out_k_id, + auto... out_spatial_id_pack) { + auto out_spatial_id = make_array(out_spatial_id_pack...); + + std::size_t group_id = out_k_id / wei_k_len_per_group; + Tacc acc = 0; + + miopen::ford(wei_c_len)([&](std::size_t wei_c_id) { + std::size_t in_c_id = group_id * wei_c_len + wei_c_id; + + auto ford_wei_spatial = miopen::unpacker(miopen::ford)(wei_spatial_len); + + ford_wei_spatial([&](auto... wei_spatial_id_pack) { + auto wei_spatial_id = make_array(wei_spatial_id_pack...); + + std::array in_spatial_id{}; + + for(std::size_t i = 0; i < ConvDim; ++i) + { + in_spatial_id[i] = + out_spatial_id[i] * strides[i] + wei_spatial_id[i] * dilations[i] - pads[i]; + } + bool out_of_bound = false; + for(std::size_t i = 0; i < ConvDim; ++i) + { + out_of_bound = out_of_bound or + (in_spatial_id[i] < 0 or in_spatial_id[i] >= in_spatial_len[i]); + } + if(!out_of_bound) + { + if(vector_len > 1) + { + std::array in_id{}; + in_id[1] = out_n_id; + in_id[2] = in_c_id; + std::copy_n(in_spatial_id.begin(), ConvDim, in_id.begin() + 3); + for(std::size_t i = 0; i < vector_len; i++) + { + in_id[0] = i; + acc += Tacc(in(in_id)) * + Tacc(wei(i, out_k_id, wei_c_id, wei_spatial_id_pack...)); + } + } + else + { + std::array in_id{}; + in_id[0] = out_n_id; + in_id[1] = in_c_id; + std::copy_n(in_spatial_id.begin(), ConvDim, in_id.begin() + 2); + Tacc tmp1 = static_cast(fi(in(in_id))); + Tacc tmp2 = + static_cast(fw(wei(out_k_id, wei_c_id, wei_spatial_id_pack...))); + acc += tmp1 * tmp2; + } + } + }); + }); + if(vector_len > 1) + { + out(out_k_id % vector_len, out_n_id, out_k_id / vector_len, out_spatial_id_pack...) = + static_cast(acc); + } + else + { + out(out_n_id, out_k_id, out_spatial_id_pack...) 
= static_cast(acc); + } + }); +} + +template +void cpu_convolution_backward_data_impl(tensor& in, + const tensor& wei, + const tensor& out, + const Range& pads, + const Range& strides, + const Range& dilations, + std::size_t group_count, + FW fw = {}, + FO fo = {}) +{ + static_assert(ConvDim > 0, "wrong! convolution dim should be larger than 0"); + assert(in.desc.GetNumDims() == ConvDim + 2 and wei.desc.GetNumDims() == ConvDim + 2 and + out.desc.GetNumDims() == ConvDim + 2 and pads.size() == ConvDim and + strides.size() == ConvDim and dilations.size() == ConvDim); + + std::size_t in_n_len = in.desc.GetLengths()[0]; + std::size_t in_c_len = in.desc.GetLengths()[1]; + + std::size_t wei_k_len = wei.desc.GetLengths()[0]; + std::size_t wei_c_len = wei.desc.GetLengths()[1]; + + std::size_t wei_k_len_per_group = wei_k_len / group_count; + + std::array in_spatial_len{}; + std::array wei_spatial_len{}; + std::array out_spatial_len{}; + + std::copy_n(in.desc.GetLengths().begin() + 2, ConvDim, in_spatial_len.begin()); + std::copy_n(wei.desc.GetLengths().begin() + 2, ConvDim, wei_spatial_len.begin()); + std::copy_n(out.desc.GetLengths().begin() + 2, ConvDim, out_spatial_len.begin()); + + auto par_ford_in_nc_spatial = + miopen::unpacker(miopen::prepender(miopen::par_ford, in_n_len, in_c_len))(in_spatial_len); + + par_ford_in_nc_spatial( + [&](std::size_t in_n_id, std::size_t in_c_id, auto... in_spatial_id_pack) { + auto in_spatial_id = make_array(in_spatial_id_pack...); + + std::size_t group_id = in_c_id / wei_c_len; + + Tacc acc = 0; + + miopen::ford(wei_k_len_per_group)([&](std::size_t wei_k_id_inside_group) { + auto ford_wei_spatial = miopen::unpacker(miopen::ford)(wei_spatial_len); + + ford_wei_spatial([&](auto... 
wei_spatial_id_pack) { + auto wei_spatial_id = make_array(wei_spatial_id_pack...); + + std::array out_spatial_id_{}; + std::array out_spatial_id{}; + + for(std::size_t i = 0; i < ConvDim; ++i) + { + out_spatial_id_[i] = + pads[i] + in_spatial_id[i] - wei_spatial_id[i] * dilations[i]; + out_spatial_id[i] = out_spatial_id_[i] / strides[i]; + } + + bool use = true; + for(std::size_t i = 0; i < ConvDim; ++i) + { + use &= out_spatial_id_[i] % strides[i] == 0 and out_spatial_id[i] >= 0 and + out_spatial_id[i] < out_spatial_len[i]; + } + + if(use) + { + std::size_t out_k_id = + group_id * wei_k_len_per_group + wei_k_id_inside_group; + std::size_t wei_c_id = in_c_id % wei_c_len; + + std::array out_id{}; + out_id[0] = in_n_id; + out_id[1] = out_k_id; + std::copy_n(out_spatial_id.begin(), ConvDim, out_id.begin() + 2); + Tacc tmp1 = fo(out(out_id)); + Tacc tmp2 = fw(wei(out_k_id, wei_c_id, wei_spatial_id_pack...)); + acc += tmp1 * tmp2; + } + }); + }); + // TODO: Why do we need a no-lint here ? + in(in_n_id, in_c_id, in_spatial_id_pack...) = static_cast(acc); // NOLINT + }); +} + +template +void cpu_convolution_backward_weight_impl(const tensor& in, + tensor& wei, + const tensor& out, + const Range& pads, + const Range& strides, + const Range& dilations, + std::size_t group_count, + FI fi, + FO fo) +{ + static_assert(ConvDim > 0, "wrong! 
convolution dim should be larger than 0"); + assert(in.desc.GetNumDims() == ConvDim + 2 and wei.desc.GetNumDims() == ConvDim + 2 and + out.desc.GetNumDims() == ConvDim + 2 and pads.size() == ConvDim and + strides.size() == ConvDim and dilations.size() == ConvDim); + + std::size_t out_n_len = out.desc.GetLengths()[0]; + + std::size_t wei_k_len = wei.desc.GetLengths()[0]; + std::size_t wei_c_len = wei.desc.GetLengths()[1]; + + std::size_t wei_k_len_per_group = wei_k_len / group_count; + + std::array in_spatial_len{}; + std::array wei_spatial_len{}; + std::array out_spatial_len{}; + + std::copy_n(in.desc.GetLengths().begin() + 2, ConvDim, in_spatial_len.begin()); + std::copy_n(wei.desc.GetLengths().begin() + 2, ConvDim, wei_spatial_len.begin()); + std::copy_n(out.desc.GetLengths().begin() + 2, ConvDim, out_spatial_len.begin()); + + auto par_ford_wei_kc_spatial = miopen::unpacker( + miopen::prepender(miopen::par_ford, wei_k_len, wei_c_len))(wei_spatial_len); + + par_ford_wei_kc_spatial( + [&](std::size_t wei_k_id, std::size_t wei_c_id, auto... wei_spatial_id_pack) { + auto wei_spatial_id = make_array(wei_spatial_id_pack...); + + std::size_t group_id = wei_k_id / wei_k_len_per_group; + std::size_t in_c_id = group_id * wei_c_len + wei_c_id; + + Tacc acc = 0; + + miopen::ford(out_n_len)([&](std::size_t out_n_id) { + auto ford_out_spatial = miopen::unpacker(miopen::ford)(out_spatial_len); + + ford_out_spatial([&](auto... 
out_spatial_id_pack) { + auto out_spatial_id = make_array(out_spatial_id_pack...); + + std::array in_spatial_id{}; + + for(std::size_t i = 0; i < ConvDim; ++i) + { + in_spatial_id[i] = out_spatial_id[i] * strides[i] + + wei_spatial_id[i] * dilations[i] - pads[i]; + } + + bool out_of_bound = false; + for(std::size_t i = 0; i < ConvDim; ++i) + { + out_of_bound = out_of_bound or (in_spatial_id[i] < 0 or + in_spatial_id[i] >= in_spatial_len[i]); + } + + if(!out_of_bound) + { + std::array in_id{}; + in_id[0] = out_n_id; + in_id[1] = in_c_id; + std::copy_n(in_spatial_id.begin(), ConvDim, in_id.begin() + 2); + Tacc tmp1 = fi(in(in_id)); + Tacc tmp2 = fo(out(out_n_id, wei_k_id, out_spatial_id_pack...)); + acc += tmp1 * tmp2; + } + }); + + wei(wei_k_id, wei_c_id, wei_spatial_id_pack...) = static_cast(acc); + }); + }); +} + +template , + typename FW = PassThru> +void cpu_convolution_forward(std::size_t spatial_dim, + const tensor& in, + const tensor& wei, + tensor& out, + const Range& pads, + const Range& strides, + const Range& dilations, + std::size_t group_count, + FI fi = {}, + FW fw = {}) +{ + switch(spatial_dim) + { + case 1: { + cpu_convolution_forward_impl<1, Tacc>( + in, wei, out, pads, strides, dilations, group_count, fi, fw); + break; + } + case 2: { + cpu_convolution_forward_impl<2, Tacc>( + in, wei, out, pads, strides, dilations, group_count, fi, fw); + break; + } + case 3: { + cpu_convolution_forward_impl<3, Tacc>( + in, wei, out, pads, strides, dilations, group_count, fi, fw); + break; + } + case 4: { + cpu_convolution_forward_impl<4, Tacc>( + in, wei, out, pads, strides, dilations, group_count, fi, fw); + break; + } + default: { + MIOPEN_THROW("not belong to any case"); + } + } +} + +template , + typename FO = PassThru> +void cpu_convolution_backward_data(std::size_t spatial_dim, + tensor& in, + const tensor& wei, + const tensor& out, + const Range& pads, + const Range& strides, + const Range& dilations, + std::size_t group_count, + FW fw = {}, + FO fo = {}) 
+{ + switch(spatial_dim) + { + case 1: { + cpu_convolution_backward_data_impl<1, Tacc>( + in, wei, out, pads, strides, dilations, group_count, fw, fo); + break; + } + case 2: { + cpu_convolution_backward_data_impl<2, Tacc>( + in, wei, out, pads, strides, dilations, group_count, fw, fo); + break; + } + case 3: { + cpu_convolution_backward_data_impl<3, Tacc>( + in, wei, out, pads, strides, dilations, group_count, fw, fo); + break; + } + case 4: { + cpu_convolution_backward_data_impl<4, Tacc>( + in, wei, out, pads, strides, dilations, group_count, fw, fo); + break; + } + default: { + MIOPEN_THROW("not belong to any case"); + } + } +} + +template , + typename FO = PassThru> +void cpu_convolution_backward_weight(std::size_t spatial_dim, + const tensor& in, + tensor& wei, + const tensor& out, + const Range& pads, + const Range& strides, + const Range& dilations, + std::size_t group_count, + FI fi = {}, + FO fo = {}) +{ + switch(spatial_dim) + { + case 1: { + cpu_convolution_backward_weight_impl<1, Tacc>( + in, wei, out, pads, strides, dilations, group_count, fi, fo); + break; + } + case 2: { + cpu_convolution_backward_weight_impl<2, Tacc>( + in, wei, out, pads, strides, dilations, group_count, fi, fo); + break; + } + case 3: { + cpu_convolution_backward_weight_impl<3, Tacc>( + in, wei, out, pads, strides, dilations, group_count, fi, fo); + break; + } + case 4: { + cpu_convolution_backward_weight_impl<4, Tacc>( + in, wei, out, pads, strides, dilations, group_count, fi, fo); + break; + } + default: { + MIOPEN_THROW("not belong to any case"); + } + } +} +#endif diff --git a/projects/miopen/miopen_utils/include/miopen_utils/cpu_layernorm.hpp b/projects/miopen/miopen_utils/include/miopen_utils/cpu_layernorm.hpp new file mode 100644 index 000000000000..0a6ab5556865 --- /dev/null +++ b/projects/miopen/miopen_utils/include/miopen_utils/cpu_layernorm.hpp @@ -0,0 +1,216 @@ +// Copyright © Advanced Micro Devices, Inc., or its affiliates. 
+// SPDX-License-Identifier: MIT
+#ifndef GUARD_CPU_LAYERNORM_HPP
+#define GUARD_CPU_LAYERNORM_HPP
+
+#include
+
+template
+void cpu_layernorm_forward(tensor input,
+                           tensor weight,
+                           tensor bias,
+                           tensor& ref_output,
+                           tensor& ref_mean,
+                           tensor& ref_rstd,
+                           float eps,
+                           int32_t dim,
+                           miopenNormMode_t mode,
+                           bool use_multithread = false)
+{
+    auto layout   = input.desc.GetLayoutEnum();
+    size_t stride = 1;
+    if(dim > 1 && layout.has_value() &&
+       (layout.value() == miopenTensorNHWC || layout.value() == miopenTensorNDHWC))
+    {
+        stride = input.desc.GetLengths()[1]; // stride = C
+    }
+
+    auto dims         = input.desc.GetLengths();
+    size_t outer_size = 1;
+    size_t inner_size = 1;
+    for(size_t i = 0; i < dims.size(); ++i)
+    {
+        if(i < dim)
+        {
+            if(!(stride > 1 && i == 1))
+            {
+                outer_size *= dims[i];
+            }
+        }
+        else
+        {
+            inner_size *= dims[i];
+        }
+    }
+
+    size_t min_grain = use_multithread ? 8 : outer_size;
+    miopen::par_for(outer_size, min_grain, [&](int32_t o) {
+        miopen::ford(stride)([&](int32_t s) {
+            double mean_v = 0.0;
+            double var_v  = 0.0;
+
+            miopen::ford(inner_size)([&](int32_t i) {
+                double tmp = static_cast(input[o * inner_size * stride + i * stride + s]);
+                mean_v += tmp;
+                var_v += tmp * tmp;
+            });
+
+            mean_v        = mean_v / inner_size;
+            var_v         = var_v / inner_size - mean_v * mean_v;
+            double rstd_v = 1.0 / sqrt(var_v + eps);
+
+            ref_mean[o * stride + s] = static_cast(mean_v);
+            ref_rstd[o * stride + s] = static_cast(rstd_v);
+
+            miopen::ford(inner_size)([&](int32_t i) {
+                double weight_v =
+                    (mode == MIOPEN_ELEMENTWISE_AFFINE) ? 1.0 : static_cast(weight[i]);
+                double bias_v =
+                    (mode == MIOPEN_ELEMENTWISE_AFFINE) ? 
0.0 : static_cast(bias[i]); + + ref_output[o * inner_size * stride + i * stride + s] = static_cast( + (static_cast(input[o * inner_size * stride + i * stride + s]) - + mean_v) * + rstd_v * weight_v + + bias_v); + }); + }); + }); +} + +template +void cpu_layernorm_backward(tensor dy, + tensor x, + tensor weight, + tensor mean, + tensor rstd, + tensor& ref_dx, + int32_t dim, + miopenNormMode_t mode, + bool use_multithread = false) +{ + auto layout = dy.desc.GetLayoutEnum(); + size_t stride = 1; + if(dim > 1 && (layout == miopenTensorNHWC || layout == miopenTensorNDHWC)) + { + stride = dy.desc.GetLengths()[1]; // stride = C + } + + auto dims = dy.desc.GetLengths(); + size_t outer_size = 1; + size_t inner_size = 1; + for(size_t i = 0; i < dims.size(); ++i) + { + if(i < dim) + { + if(!(stride > 1 && i == 1)) + { + outer_size *= dims[i]; + } + } + else + { + inner_size *= dims[i]; + } + } + + size_t min_grain = use_multithread ? 8 : outer_size; + miopen::par_for(outer_size, min_grain, [&](int32_t o) { + miopen::ford(stride)([&](int32_t s) { + double sum_dy_weight = 0.0; + double sum_dy_weight_x = 0.0; + + miopen::ford(inner_size)([&](int32_t i) { + double pweight = + (mode == MIOPEN_ELEMENTWISE_AFFINE) ? 1.0 : static_cast(weight[i]); + double pdy = (dy.GetSize() != 0) + ? static_cast(dy[o * inner_size * stride + i * stride + s]) + : 0.0; + double px = static_cast(x[o * inner_size * stride + i * stride + s]); + + sum_dy_weight += pdy * pweight; + sum_dy_weight_x += pdy * px * pweight; + }); + + double scale = 1.0 / static_cast(inner_size); + double prstd = static_cast(rstd[o * stride + s]); + double pmean = static_cast(mean[o * stride + s]); + double a = prstd * prstd * prstd * scale * (sum_dy_weight_x - sum_dy_weight * pmean); + double b = prstd * sum_dy_weight * scale - a * pmean; + + miopen::ford(inner_size)([&](int32_t i) { + double pweight = + (mode == MIOPEN_ELEMENTWISE_AFFINE) ? 1.0 : static_cast(weight[i]); + double pdy = (dy.GetSize() != 0) + ? 
static_cast(dy[o * inner_size * stride + i * stride + s]) + : 0.0; + double val = prstd * pdy * pweight - + a * static_cast(x[o * inner_size * stride + i * stride + s]) - + b; + + ref_dx[o * inner_size * stride + i * stride + s] = static_cast(val); + }); + }); + }); +} + +template +void cpu_layernorm_backward_weight_bias(tensor dy, + tensor x, + tensor mean, + tensor rstd, + tensor& ref_dw, + tensor& ref_db, + int32_t dim, + bool use_multithread = false) +{ + auto layout = dy.desc.GetLayoutEnum(); + size_t stride = 1; + if(dim > 1 && (layout == miopenTensorNHWC || layout == miopenTensorNDHWC)) + { + stride = dy.desc.GetLengths()[1]; // stride = C + } + + auto dims = dy.desc.GetLengths(); + size_t outer_size = 1; + size_t inner_size = 1; + for(size_t i = 0; i < dims.size(); ++i) + { + if(i < dim) + { + if(!(stride > 1 && i == 1)) + { + outer_size *= dims[i]; + } + } + else + { + inner_size *= dims[i]; + } + } + + size_t min_grain = use_multithread ? 8 : inner_size; + miopen::par_for(inner_size, min_grain, [&](int32_t i) { + double sum_dw = 0.0; + double sum_db = 0.0; + + miopen::ford(stride)([&](int32_t s) { + miopen::ford(outer_size)([&](int32_t o) { + double prstd = static_cast(rstd[o * stride + s]); + double pmean = static_cast(mean[o * stride + s]); + double pdy = (dy.GetSize() != 0) + ? 
static_cast(dy[o * inner_size * stride + i * stride + s]) + : 0; + double px = static_cast(x[o * inner_size * stride + i * stride + s]); + + sum_dw += pdy * (px - pmean) * prstd; + sum_db += pdy; + }); + }); + + ref_dw[i] = sum_dw; + ref_db[i] = sum_db; + }); +} + +#endif diff --git a/projects/miopen/miopen_utils/include/miopen_utils/cpu_reduce_util.hpp b/projects/miopen/miopen_utils/include/miopen_utils/cpu_reduce_util.hpp new file mode 100644 index 000000000000..e5f7d50f9d0b --- /dev/null +++ b/projects/miopen/miopen_utils/include/miopen_utils/cpu_reduce_util.hpp @@ -0,0 +1,649 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2020 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + *******************************************************************************/ +#ifndef GUARD_CPU_REDUCE_UTIL_HPP +#define GUARD_CPU_REDUCE_UTIL_HPP + +#include "miopen/reducetensor.hpp" +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace reduce { + +template +static inline bool float_equal_one(T); + +static inline bool float_equal_one(float x) { return x == 1.0f; }; + +static inline bool float_equal_one(double x) { return x == 1.0; }; + +static inline bool float_equal_one(half_float::half x) +{ + return x == convert_type(1.0f); +}; + +template +static inline bool float_equal_zero(T x); + +static inline bool float_equal_zero(float x) { return x == 0.0f; }; + +static inline bool float_equal_zero(double x) { return x == 0.0; }; + +static inline bool float_equal_zero(half_float::half x) +{ + return x == convert_type(0.0f); +}; + +template +static inline void build_radix(const std::vector& lens, std::vector& radix) +{ + const std::size_t D = lens.size(); + radix.assign(D, 1); + for(std::size_t d = D; d-- > 1;) + radix[d - 1] = radix[d] * static_cast(lens[d]); // radix[d] = Π_{k>d} lens[k] +} + +// i -> memory offset using lens-radix + actual strides +template +static inline std::size_t linear_to_offset_by_lens_strides(std::size_t i, + const std::vector& lens, + const std::vector& radix, + const std::vector& strides) +{ + std::size_t off = 0; + for(std::size_t d = 0; d < lens.size(); ++d) + { + const std::size_t idx_d = (i / radix[d]) % static_cast(lens[d]); + off += idx_d * static_cast(strides[d]); + } + return off; +} + +template +static inline std::function PreUnaryOpFn(miopenReduceTensorOp_t op_, std::size_t) +{ + using std::abs; + + switch(op_) + { + case MIOPEN_REDUCE_TENSOR_NORM1: return ([&](compType& a_) { a_ = abs(a_); }); + case MIOPEN_REDUCE_TENSOR_NORM2: return ([&](compType& a_) { a_ = a_ * a_; }); + case MIOPEN_REDUCE_TENSOR_AMAX: return ([&](compType& a_) { a_ = abs(a_); }); + 
+ case MIOPEN_REDUCE_TENSOR_AVG: + case MIOPEN_REDUCE_TENSOR_ADD: + case MIOPEN_REDUCE_TENSOR_MUL: + case MIOPEN_REDUCE_TENSOR_MIN: + case MIOPEN_REDUCE_TENSOR_MAX: return ([&](compType&) {}); + } + + throw std::runtime_error(std::string(__FUNCTION__) + + ": using undefined Reduction operation is not permitted"); +}; + +template +static inline std::function PosUnaryOpFn(miopenReduceTensorOp_t op_, + std::size_t divider) +{ + using std::sqrt; + + switch(op_) + { + case MIOPEN_REDUCE_TENSOR_NORM2: return ([&](compType& a_) { a_ = sqrt(a_); }); + + case MIOPEN_REDUCE_TENSOR_AVG: + return ([&, divider](compType& a_) { + a_ = a_ / convert_type(static_cast(divider)); + }); + + case MIOPEN_REDUCE_TENSOR_ADD: + case MIOPEN_REDUCE_TENSOR_NORM1: + case MIOPEN_REDUCE_TENSOR_MUL: + case MIOPEN_REDUCE_TENSOR_MIN: + case MIOPEN_REDUCE_TENSOR_MAX: + case MIOPEN_REDUCE_TENSOR_AMAX: return ([&](compType&) {}); + } + + throw std::runtime_error(std::string(__FUNCTION__) + + ": using undefined Reduction operation is not permitted"); +}; + +template +static inline std::function ReduceOpFn(miopenReduceTensorOp_t op_) +{ + switch(op_) + { + case MIOPEN_REDUCE_TENSOR_ADD: + case MIOPEN_REDUCE_TENSOR_AVG: + case MIOPEN_REDUCE_TENSOR_NORM1: + case MIOPEN_REDUCE_TENSOR_NORM2: return ([&](compType& a_, compType b_) { a_ = a_ + b_; }); + + case MIOPEN_REDUCE_TENSOR_MUL: return ([&](compType& a_, compType b_) { a_ = a_ * b_; }); + + case MIOPEN_REDUCE_TENSOR_MIN: + return ([&](compType& a_, compType b_) { + if(a_ > b_) + a_ = b_; + }); + + case MIOPEN_REDUCE_TENSOR_MAX: + case MIOPEN_REDUCE_TENSOR_AMAX: + return ([&](compType& a_, compType b_) { + if(a_ < b_) + a_ = b_; + }); + } + + throw std::runtime_error(std::string(__FUNCTION__) + + ": using undefined Reduction operation is not permitted"); +}; + +template +static inline std::function +ReduceOpFn2(miopenReduceTensorOp_t op_) +{ + switch(op_) + { + case MIOPEN_REDUCE_TENSOR_MIN: + return ([&](compType& a_, compType b_, bool& changed) { + 
if(a_ > b_) + { + a_ = b_; + changed = true; + } + else + { + changed = false; + } + }); + + case MIOPEN_REDUCE_TENSOR_MAX: + case MIOPEN_REDUCE_TENSOR_AMAX: + return ([&](compType& a_, compType b_, bool& changed) { + if(a_ < b_) + { + a_ = b_; + changed = true; + } + else + { + changed = false; + } + }); + + case MIOPEN_REDUCE_TENSOR_ADD: + case MIOPEN_REDUCE_TENSOR_MUL: + case MIOPEN_REDUCE_TENSOR_AVG: + case MIOPEN_REDUCE_TENSOR_NORM1: + case MIOPEN_REDUCE_TENSOR_NORM2: return (std::function{}); + }; + + throw std::runtime_error(std::string(__FUNCTION__) + + ": using undefined Reduction operation is not permitted"); +}; + +template +static inline compType ReduceOpZeroVal(miopenReduceTensorOp_t op_) +{ + switch(op_) + { + case MIOPEN_REDUCE_TENSOR_ADD: + case MIOPEN_REDUCE_TENSOR_AVG: + case MIOPEN_REDUCE_TENSOR_NORM1: + case MIOPEN_REDUCE_TENSOR_NORM2: return (convert_type(0.0f)); + + case MIOPEN_REDUCE_TENSOR_MUL: return (convert_type(1.0f)); + + case MIOPEN_REDUCE_TENSOR_MIN: return (std::numeric_limits::max()); + + case MIOPEN_REDUCE_TENSOR_MAX: return (std::numeric_limits::lowest()); + case MIOPEN_REDUCE_TENSOR_AMAX: return (convert_type(0.0f)); + } + + throw std::runtime_error(std::string(__FUNCTION__) + + ": using undefined Reduction operation is not permitted"); +}; + +template +static inline void binop_with_nan_check(miopenNanPropagation_t nanOpt, + reduceOpT&& opReduce, + compType& accuVal, + compType currVal) +{ + using std::isnan; + + if(nanOpt == MIOPEN_NOT_PROPAGATE_NAN) + { + opReduce(accuVal, currVal); + } + else + { + if(isnan(currVal)) + accuVal = currVal; + else + opReduce(accuVal, currVal); + }; +}; + +template +static inline void binop_with_nan_check2(miopenNanPropagation_t nanOpt, + reduceOpT&& opReduce, + compType& accuVal, + compType currVal, + int& accuIndex, + int currIndex) +{ + using std::isnan; + + if(nanOpt == MIOPEN_NOT_PROPAGATE_NAN) + { + bool changed; + + opReduce(accuVal, currVal, changed); + + if(changed) + accuIndex = 
currIndex; + } + else + { + if(isnan(currVal)) + { + accuVal = currVal; + accuIndex = currIndex; + } + else + { + bool changed; + + opReduce(accuVal, currVal, changed); + + if(changed) + accuIndex = currIndex; + }; + }; +}; + +}; // end of namespace reduce + +template +std::vector> get_all_indexes(const std::vector& lens) +{ + const std::size_t D = lens.size(); + assert(D > 0); + + std::size_t N = 1; + for(const auto L : lens) + N *= static_cast(L); + + std::vector> out; + out.resize(N); + for(auto& row : out) + row.resize(D); + + std::vector stride(D, 1); + for(std::size_t d = D; d-- > 1;) + stride[d - 1] = stride[d] * static_cast(lens[d]); + + for(std::size_t r = 0; r < N; ++r) + { + for(std::size_t d = 0; d < D; ++d) + out[r][d] = static_cast((r / stride[d]) % static_cast(lens[d])); + } + + return out; +} + +template +static inline T +linear_to_offset(size_t li, const std::vector& lens, const std::vector& strides) +{ + T off = 0; + for(int d = int(lens.size()) - 1; d >= 0; --d) + { + const T idx = li % lens[d]; + li /= lens[d]; + off += idx * strides[d]; + } + return off; +} + +template +T get_offset_from_index(const std::vector& strides, const std::vector& index) +{ + T offset = 0; + + assert(strides.size() == index.size()); + + for(int i = 0; i < index.size(); i++) + offset += strides[i] * index[i]; + + return (offset); +}; + +template +T get_flatten_offset(const std::vector& lengths, const std::vector& index) +{ + T offset = 0; + + assert(lengths.size() == index.size() && !lengths.empty()); + + int len = lengths.size(); + T stride = 1; + + // for len==1, the loop is not executed + for(int i = len - 1; i > 0; i--) + { + offset += stride * index[i]; + + stride *= lengths[i]; + }; + + offset += stride * index[0]; + + return (offset); +}; + +template +struct Reducer +{ + compType acc; + bool withIdx; + int idx; // meaningful only when WithIdx==true + miopenNanPropagation_t nanOpt; + // functors for reduction + 
decltype(reduce::ReduceOpFn(MIOPEN_REDUCE_TENSOR_ADD)) opNoIdx; + decltype(reduce::ReduceOpFn2(MIOPEN_REDUCE_TENSOR_ADD)) opWithIdx; + + Reducer(miopenNanPropagation_t n, miopenReduceTensorOp_t rop, compType zero, bool useIdx) + : acc(zero), + withIdx(useIdx), + idx(0), + nanOpt(n), + opNoIdx(reduce::ReduceOpFn(rop)), + opWithIdx(reduce::ReduceOpFn2(rop)) + { + } + + inline void step(compType v, int flat_i) + { + if(withIdx) + reduce::binop_with_nan_check2(nanOpt, opWithIdx, acc, v, idx, flat_i); + else + reduce::binop_with_nan_check(nanOpt, opNoIdx, acc, v); + } + + inline void combine(const Reducer& other) + { + if(withIdx) + reduce::binop_with_nan_check2(nanOpt, opWithIdx, acc, other.acc, idx, other.idx); + else + reduce::binop_with_nan_check(nanOpt, opNoIdx, acc, other.acc); + } +}; + +template +std::tuple, tensor> reduce_cpu_common(const miopenReduceTensorOp_t& reduceOp, + const miopenNanPropagation_t& nanOpt, + const std::vector& inLengths, + const std::vector& outLengths, + const std::vector& input, + const std::vector& inStrides, + const std::vector& output, + const std::vector& outStrides, + float alpha, + float beta, + bool parallel, + bool withIdx) +{ + using reduce::convert_type; + using reduce::ReduceOpZeroVal; + + // Partition dims + std::vector invariantDims, toReduceDims; + std::vector invLens, redLens, invStrides_v, redStrides_v; + + for(int i = 0; i < static_cast(inLengths.size()); ++i) + { + if(inLengths[i] == outLengths[i]) + { + invariantDims.push_back(i); + invLens.push_back(inLengths[i]); + invStrides_v.push_back(inStrides[i]); + } + else + { + toReduceDims.push_back(i); + redLens.push_back(inLengths[i]); + redStrides_v.push_back(inStrides[i]); + } + } + + const bool reduceAllDims = invariantDims.empty(); + + // unary ops & zero vals + const compType zeroV = ReduceOpZeroVal(reduceOp); + + // divider = Π reduced dims (or N if reduce-all) + std::size_t divider = 1; + if(reduceAllDims) + divider = std::accumulate( + inLengths.begin(), 
inLengths.end(), std::size_t{1}, std::multiplies<>()); + else + divider = + std::accumulate(redLens.begin(), redLens.end(), std::size_t{1}, std::multiplies<>()); + + auto PreUnaryOp = reduce::PreUnaryOpFn(reduceOp, divider); + auto PosUnaryOp = reduce::PosUnaryOpFn(reduceOp, divider); + + // outputs + auto res = tensor{outLengths}; + res.data = output; + auto res_indices = tensor{outLengths}; + if(withIdx) + std::fill(res_indices.begin(), res_indices.end(), 0); + + if(reduceAllDims) + { + // Flatten whole tensor + const std::size_t N = divider; // product of all dims + std::vector lens_radix; + reduce::build_radix(inLengths, lens_radix); + + // parallel chunking + std::size_t hw = + std::max(std::size_t{1}, static_cast(std::thread::hardware_concurrency())); + const std::size_t P = std::min(N, hw * 4ul); + const std::size_t chunk = (N + P - 1) / P; + + std::vector> partial; + partial.reserve(P); + for(std::size_t p = 0; p < P; ++p) + partial.emplace_back(nanOpt, reduceOp, zeroV, withIdx); + + auto worker = [&](int p) { + const std::size_t begin = std::size_t(p) * chunk; + const std::size_t end = std::min(begin + chunk, N); + + auto& r = partial[p]; + for(std::size_t i = begin; i < end; ++i) + { + const auto off = + reduce::linear_to_offset_by_lens_strides(i, inLengths, lens_radix, inStrides); + auto v = convert_type(input[off]); + PreUnaryOp(v); + r.step(v, static_cast(i)); // flat index across whole tensor + } + }; + + if(parallel) + { + miopen::par_for(static_cast(P), worker); + } + else + { + for(int p = 0; p < P; ++p) + { + worker(p); + } + } + + // combine + Reducer R(nanOpt, reduceOp, zeroV, withIdx); + for(std::size_t p = 0; p < P; ++p) + R.combine(partial[p]); + + // post + PosUnaryOp(R.acc); + if(alpha != 1.0f) + R.acc *= convert_type(alpha); + if(beta != 0.0f) + R.acc += convert_type(output[0]) * convert_type(beta); + + res.data[0] = convert_type(R.acc); + if(withIdx) + res_indices.data[0] = R.idx; + } + else + { + // Build radices for invariant and 
reduced subspaces + std::vector invRad, redRad; + reduce::build_radix(invLens, invRad); + reduce::build_radix(redLens, redRad); + + const std::size_t INV = + std::accumulate(invLens.begin(), invLens.end(), std::size_t{1}, std::multiplies<>()); + const std::size_t TR = divider; + + std::size_t hw = + std::max(std::size_t{1}, static_cast(std::thread::hardware_concurrency())); + const std::size_t Te = std::min(hw * 4ul, std::max(1, INV)); + const std::size_t chunk = (INV + Te - 1) / Te; + + auto worker = [&](int t) { + const std::size_t row0 = std::size_t(t) * chunk; + const std::size_t row1 = std::min(row0 + chunk, INV); + + for(std::size_t r = row0; r < row1; ++r) + { + // decode invariant multi-index; compute base offsets + std::size_t tmp = r; + std::size_t base_in_off = 0; + std::size_t base_out_off = 0; + for(std::size_t k = 0; k < invLens.size(); ++k) + { + const std::size_t idx = (tmp / invRad[k]) % invLens[k]; + base_in_off += idx * invStrides_v[k]; + base_out_off += idx * outStrides[invariantDims[k]]; + } + + Reducer R(nanOpt, reduceOp, zeroV, withIdx); + + // iterate reduced subspace + for(std::size_t i = 0; i < TR; ++i) + { + std::size_t tmp2 = i; + std::size_t red_off = 0; + for(std::size_t k = 0; k < redLens.size(); ++k) + { + const std::size_t idx = (tmp2 / redRad[k]) % redLens[k]; + red_off += idx * redStrides_v[k]; + } + + auto v = convert_type(input[base_in_off + red_off]); + PreUnaryOp(v); + R.step(v, static_cast(i)); // flat index inside reduced subspace + } + + PosUnaryOp(R.acc); + if(alpha != 1.0f) + R.acc *= convert_type(alpha); + if(beta != 0.0f) + R.acc += + convert_type(output[base_out_off]) * convert_type(beta); + + res.data[base_out_off] = convert_type(R.acc); + if(withIdx) + res_indices.data[base_out_off] = R.idx; + } + }; + + if(parallel) + { + miopen::par_for(static_cast(Te), worker); + } + else + { + for(int te = 0; te < Te; ++te) + { + worker(te); + } + } + } + + return {res, res_indices}; +} + +template +std::tuple, tensor> 
+reduce_cpu_common(const miopen::ReduceTensorDescriptor& reduceDesc, + const tensor& input, + const tensor& output, + float alpha, + float beta, + bool parallel, + bool withIdx) +{ + auto inLengths = input.desc.GetLengths(); + auto outLengths = output.desc.GetLengths(); + auto inStrides = input.desc.GetStrides(); + auto outStrides = output.desc.GetStrides(); + + const auto reduceOp = reduceDesc.reduceTensorOp_; + const auto nanOpt = reduceDesc.reduceTensorNanOpt_; + + return reduce_cpu_common(reduceOp, + nanOpt, + inLengths, + outLengths, + input.data, + inStrides, + output.data, + outStrides, + alpha, + beta, + parallel, + withIdx); +} + +#endif diff --git a/projects/miopen/miopen_utils/include/miopen_utils/fusionHost.hpp b/projects/miopen/miopen_utils/include/miopen_utils/fusionHost.hpp new file mode 100644 index 000000000000..2d1d33cc898a --- /dev/null +++ b/projects/miopen/miopen_utils/include/miopen_utils/fusionHost.hpp @@ -0,0 +1,993 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2018 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + *******************************************************************************/ +#pragma once +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +template +void convHostForward(const tensor& input, + tensor& output, + const tensor& weights, + const int bias_mode, + const tensor& bias, + const miopenConvolutionDescriptor_t convDesc) +{ + + int in_n, in_c, in_h, in_w; + int in_nstride, in_cstride, in_hstride, in_wstride; + std::tie(in_n, in_c, in_h, in_w) = miopen::tien<4>(input.desc.GetLengths()); + std::tie(in_nstride, in_cstride, in_hstride, in_wstride) = + miopen::tien<4>(input.desc.GetStrides()); + + int wei_n, wei_c, wei_h, wei_w; + int wei_nstride, wei_cstride, wei_hstride, wei_wstride; + std::tie(wei_n, wei_c, wei_h, wei_w) = miopen::tien<4>(weights.desc.GetLengths()); + std::tie(wei_nstride, wei_cstride, wei_hstride, wei_wstride) = + miopen::tien<4>(weights.desc.GetStrides()); + + int out_n, out_c, out_h, out_w; + int out_nstride, out_cstride, out_hstride, out_wstride; + std::tie(out_n, out_c, out_h, out_w) = miopen::tien<4>(output.desc.GetLengths()); + std::tie(out_nstride, out_cstride, out_hstride, out_wstride) = + miopen::tien<4>(output.desc.GetStrides()); + + int stride_h, stride_w, pad_h, pad_w, dilation_h, dilation_w; + miopenConvolutionMode_t mode; + miopenPaddingMode_t pmode = miopen::deref(convDesc).paddingMode; + miopenGetConvolutionDescriptor( + convDesc, &mode, &pad_h, &pad_w, &stride_h, &stride_w, &dilation_h, &dilation_w); + + if(pmode == miopenPaddingSame) + { + pad_h = (in_h % stride_h == 0) ? 
(std::max((wei_h - stride_h), 0)) + : (std::max((wei_h - (in_h % stride_h)), 0)); + pad_w = (in_w % stride_w == 0) ? (std::max((wei_w - stride_w), 0)) + : (std::max((wei_w - (in_w % stride_w)), 0)); + pad_h /= 2; + pad_w /= 2; + } + else if(pmode == miopenPaddingValid) + { + pad_h = 0; + pad_w = 0; + } + + if(out_h <= 0 || out_w <= 0) + MIOPEN_THROW("Invalid Test Case: Check Output Dimension."); + + for(int o = 0; o < out_n; o++) + { // mini-batch size + for(int w = 0; w < out_c; w++) + { // out_channels (num filters) + for(int i = 0; i < out_h; i++) + { // output_height (from getforwardoutputdim()) + int in_off_h = i * stride_h; + for(int j = 0; j < out_w; j++) + { // output_width (from getforwardoutputdim()) + /*auto acc = static_cast(0.);*/ + auto acc = static_cast(0.); + int in_off_w = j * stride_w; + for(int k = 0; k < in_c; k++) + { // in_channels (RGB) + for(int x = 0; x < wei_h; x++) + { + int in_x = in_off_h - pad_h + x * dilation_h; + if(in_x >= 0 && in_x < in_h) + { + for(int y = 0; y < wei_w; y++) + { + int in_y = in_off_w - pad_w + y * dilation_w; + if(in_y >= 0 && in_y < in_w) + { + acc += double( + static_cast(input[o * in_nstride + k * in_cstride + + in_x * in_w + in_y]) * + static_cast(weights(w, k, x, y))); + } + } + } + } + } + acc = bias_mode != 0 ? 
acc + static_cast(bias[w]) : acc; + output[o * out_nstride + w * out_cstride + i * out_hstride + j] = + static_cast(acc); + } + } + } + } +} + +template +void batchNormSpatialHostInference(const tensor& input, + tensor& output, + const tensor& scale, + const tensor& bias, + double epsilon, + const tensor& estimatedMean, + const tensor& estimatedVariance, + bool useInverseVariance = false) +{ + + int n_batches, channels, height, width; + std::tie(n_batches, channels, height, width) = miopen::tien<4>(input.desc.GetLengths()); + miopen::par_for(channels, 1, [&](int cidx) { // via channel + V mean = estimatedMean(0, cidx, 0, 0); + V variance = estimatedVariance(0, cidx, 0, 0); + double invertVar = + useInverseVariance ? static_cast(variance) : 1.0 / sqrt(variance + epsilon); + // process the batch per channel + for(int row = 0; row < height; row++) + { // via rows + for(int column = 0; column < width; column++) + { // via columns + for(int bidx = 0; bidx < n_batches; bidx++) + { // via mini_batch + double elemStd = static_cast(input(bidx, cidx, row, column)) - mean; + double inhat = elemStd * invertVar; + output(bidx, cidx, row, column) = + static_cast(scale(0, cidx, 0, 0) * inhat + bias(0, cidx, 0, 0)); + // printf("output: %f\n",scale(0, cidx, 0, 0) * inhat + bias(0, cidx, 0, 0)); + } + } + } + }); +} + +template +void batchNormPerActivHostInference(const tensor& input, + tensor& output, + const tensor& scale, + const tensor& bias, + double epsilon, + const tensor& estimatedMean, + const tensor& estimatedVariance, + bool useInverseVariance = false) +{ + int n_batches, channels, height, width; + std::tie(n_batches, channels, height, width) = miopen::tien<4>(input.desc.GetLengths()); + miopen::par_for(channels, 1, [&](int cidx) { // via channel + for(int row = 0; row < height; row++) + { // via rows + for(int column = 0; column < width; column++) + { // via columns + // apply down the n_batch dimension + double mean = estimatedMean(0, cidx, row, column); + double 
variance = estimatedVariance(0, cidx, row, column); + double elemInvVar = useInverseVariance ? variance : 1.0 / sqrt(variance + epsilon); + for(int bidx = 0; bidx < n_batches; bidx++) + { // via mini_batch + // per (x-dims) channel load a block of data into LDS + double elemStd = input(bidx, cidx, row, column) - mean; + double inhat = elemStd * elemInvVar; + output(bidx, cidx, row, column) = + scale(0, cidx, row, column) * inhat + bias(0, cidx, row, column); + // printf("output: %f\n",output(bidx, cidx, row, column)); + } + } + } + }); +} + +template +void batchNormSpatialHostFwdTrain(const tensor& input, + tensor& out, + const tensor& scale, + const tensor& bias, + double epsilon, + double expAvgFactor, + tensor& saveMean, + tensor& saveInvVar, + tensor& runMean, + tensor& runVar) +{ + + int height, width, n_batch, channels; + std::tie(n_batch, channels, height, width) = miopen::tien<4>(input.desc.GetLengths()); + const auto nhw = double(height * width * n_batch); + + miopen::par_for(channels, 1, [&](int cidx) { + double elemStd = 0.; + double variance_accum = 0.; + double mean_accum = 0.; + double invVar = 0.; + double newRunMean = 0.; + double adjust = 0.; + + // process the batch per channel + for(int bidx = 0; bidx < n_batch; bidx++) + { // via mini_batch + for(int row = 0; row < height; row++) + { // via rows + for(int column = 0; column < width; column++) + { // via columns + // #1 calculate the mean + // iterating through the stack of images in the mini_batch + auto inval = static_cast(input(bidx, cidx, row, column)); + mean_accum += inval; + variance_accum += inval * inval; + } // end for (column) + } // end for (row) + } // end for (n) + + mean_accum /= nhw; + variance_accum /= nhw; + variance_accum += (-mean_accum * mean_accum); + invVar = 1.0 / sqrt(variance_accum + epsilon); + + // #4 apply the normalization + // x_hat = (x_i - mean) / sqrt(variance_accum + epsilon) + for(int bidx = 0; bidx < n_batch; bidx++) + { // via mini_batch + for(int row = 0; 
row < height; row++) + { // via rows + for(int column = 0; column < width; column++) + { // via columns + // #5 Gamma and Beta adjust + // y_i = gamma*x_hat + beta + elemStd = (static_cast(input(bidx, cidx, row, column)) - + mean_accum); // (x_i - mean) + out(bidx, cidx, row, column) = static_cast( + scale(0, cidx, 0, 0) * (invVar * elemStd) + bias(0, cidx, 0, 0)); + } // for (column) + } // for (row) + } // end for(n_batchs) + if(!saveMean.data.empty()) + { + saveMean(0, cidx, 0, 0) = mean_accum; + saveInvVar(0, cidx, 0, 0) = invVar; + } + if(!runMean.data.empty()) + { + newRunMean = runMean(0, cidx, 0, 0) * (1 - expAvgFactor); + runMean(0, cidx, 0, 0) = mean_accum * expAvgFactor + newRunMean; // newMean*factor + tmp + // var(n+1) = p * var(n-1) + (1 - p)*(b/b-1)*var(n) + adjust = (n_batch * height * width == 1) ? variance_accum + : (nhw / (nhw - 1)) * variance_accum; + runVar(0, cidx, 0, 0) = + (1 - expAvgFactor) * runVar(0, cidx, 0, 0) + expAvgFactor * adjust; + } + }); +} + +template +void batchNormSpatialHostBwdTrain(const tensor& x_input, + tensor& dy_input, + tensor& dx_out, + const tensor& bnScale, + const tensor& bnBias, + tensor& dscale, + tensor& dbias, + const tensor& savedMean, + const tensor& savedInvVar, + miopenActivationMode_t activ_mode, + double activ_beta, + double activ_alpha) +{ + double activ_gamma = 0.; + int height, width, n_batch, channels; + std::tie(n_batch, channels, height, width) = miopen::tien<4>(x_input.desc.GetLengths()); + auto nhw = double(height * width * n_batch); + int in_cstride = height * width; + + if(activ_mode > 0) + { + tensor input_norm = + tensor{x_input.desc.GetLayout_t(), x_input.desc.GetLengths()}; + miopen::par_for(channels, 1, [&](int cidx) { + double mean = 0.0; + double invVar = 0.0; + double elemStd = 0.; + double mean_accum = 0.0; + double variance_accum = 0.0; + if(!savedMean.data.empty()) + { + mean = static_cast(savedMean(0, cidx, 0, 0)); // HxW elements + invVar = static_cast(savedInvVar(0, cidx, 0, 0)); 
// HxW elements + } + else + { + for(int row = 0; row < height; row++) + { // via rows + for(int column = 0; column < width; column++) + { // via columns + for(int bidx = 0; bidx < n_batch; bidx++) + { // via mini_batch + auto inval = static_cast(x_input(bidx, cidx, row, column)); + mean_accum += inval; + variance_accum += inval * inval; + } + } + } + mean_accum /= nhw; + variance_accum /= nhw; + variance_accum += (-mean_accum * mean_accum); + mean = mean_accum; + invVar = 1.0 / sqrt(variance_accum); + } + for(int row = 0; row < height; row++) + { // via rows + for(int column = 0; column < width; column++) + { // via columns + for(int bidx = 0; bidx < n_batch; bidx++) + { // via mini_batch + elemStd = static_cast(x_input(bidx, cidx, row, column)) - + mean; // (x_i - mean) + input_norm(bidx, cidx, row, column) = static_cast( + bnScale(0, cidx, 0, 0) * (elemStd * invVar) + bnBias(0, cidx, 0, 0)); + } + } + } + }); + + activationHostBnormBwd(activ_mode, + activ_gamma, + activ_beta, + activ_alpha, + dy_input.data, + input_norm.data, + dy_input.data); + } + + miopen::par_for(channels, 1, [&](int cidx) { + double elemStd = 0.; + unsigned int xhat_index; + double mean = 0.0; + double invVar = 0.0; + double dyelem = 0.; + std::vector xhat(static_cast(n_batch) * in_cstride, 0.0); + // process the batch per channel + dscale(0, cidx, 0, 0) = 0.; + dbias(0, cidx, 0, 0) = 0.; + + if(!savedMean.data.empty()) + { + + mean = savedMean(0, cidx, 0, 0); // HxW elements + invVar = savedInvVar(0, cidx, 0, 0); // HxW elements + } + else + { + double variance_accum = 0.; + double mean_accum = 0.; + double inv_Var = 0.; + + // process the batch per channel + for(int bidx = 0; bidx < n_batch; bidx++) + { // via mini_batch + for(int row = 0; row < height; row++) + { // via rows + for(int column = 0; column < width; column++) + { // via columns + // #1 calculate the mean + // iterating through the stack of images in the mini_batch + auto inval = static_cast(x_input(bidx, cidx, row, column)); 
+ mean_accum += inval; + variance_accum += inval * inval; + } // end for (column) + } // end for (row) + } // end for (n) + + mean_accum /= nhw; + variance_accum /= nhw; + variance_accum += (-mean_accum * mean_accum); + inv_Var = 1.0 / sqrt(variance_accum); + + mean = mean_accum; + invVar = inv_Var; + } + for(int row = 0; row < height; row++) + { // via rows + for(int column = 0; column < width; column++) + { // via columns + for(int bidx = 0; bidx < n_batch; bidx++) + { // via mini_batch + xhat_index = in_cstride * bidx + (width * row + column); + // per (x-dims) channel load a block of data into LDS + elemStd = static_cast(x_input(bidx, cidx, row, column)) - + mean; // (x_i - mean) + xhat[xhat_index] = elemStd * invVar; + dyelem = static_cast(dy_input(bidx, cidx, row, column)); + dbias(0, cidx, 0, 0) += dyelem; + dscale(0, cidx, 0, 0) += xhat[xhat_index] * dyelem; + } // end for(n_batch) + } // for (column) + } // for (row) + + for(int row = 0; row < height; row++) + { // via rows + for(int column = 0; column < width; column++) + { // via columns + for(int bidx = 0; bidx < n_batch; bidx++) + { // via mini_batch + xhat_index = in_cstride * bidx + (width * row + column); + + double tmp1 = nhw * dy_input(bidx, cidx, row, column) - dbias(0, cidx, 0, 0); + double tmp2 = -xhat[xhat_index] * dscale(0, cidx, 0, 0); + double tmp3 = (bnScale(0, cidx, 0, 0) * invVar) / nhw; + dx_out(bidx, cidx, row, column) = + static_cast(tmp3 * (tmp2 + tmp1)); + } // end for(n_batchs) + } // for (column) + } // for (row) + }); // for (channel) +} + +template +void batchNormActivSpatialHostBwdTrain(miopenActivationMode_t activMode, + double gamma, + double beta, + double alpha, + const tensor& x_input, + const tensor& dy_input, + const tensor& y_input, + tensor& dx_out, + const tensor& bnScale, + const tensor& bias, + tensor& dscale, + tensor& dbias, + const tensor& savedMean, + const tensor& savedInvVar) +{ + + int height, width, n_batch, channels; + std::tie(n_batch, channels, height, 
width) = miopen::tien<4>(x_input.desc.GetLengths()); + auto nhw = double(height * width * n_batch); + int in_cstride = height * width; + + miopen::par_for(channels, 1, [&](int cidx) { + double elemStd = 0.; + unsigned int xhat_index; + double mean = static_cast(savedMean(0, cidx, 0, 0)); // HxW elements + double invVar = static_cast(savedInvVar(0, cidx, 0, 0)); // HxW elements + double dyelem = 0.; + std::vector xhat(static_cast(n_batch) * in_cstride, 0.0); + // process the batch per channel + dscale(0, cidx, 0, 0) = 0.; + dbias(0, cidx, 0, 0) = 0.; + + for(int row = 0; row < height; row++) + { // via rows + for(int column = 0; column < width; column++) + { // via columns + for(int bidx = 0; bidx < n_batch; bidx++) + { // via mini_batch + + // recompute forward batch norm + xhat_index = in_cstride * bidx + (width * row + column); + // per (x-dims) channel load a block of data into LDS + elemStd = static_cast(x_input(bidx, cidx, row, column)) - + mean; // (x_i - mean) + xhat[xhat_index] = elemStd * invVar; + double bnrefowd = + bnScale(0, cidx, 0, 0) * xhat[xhat_index] + bias(0, cidx, 0, 0); + activationHostBwdElement(activMode, + gamma, + beta, + alpha, + dy_input(bidx, cidx, row, column), + bnrefowd, + y_input(bidx, cidx, row, column), + dyelem); + dbias(0, cidx, 0, 0) += dyelem; + dscale(0, cidx, 0, 0) += xhat[xhat_index] * dyelem; + } // end for(n_batch) + } // for (column) + } // for (row) + + for(int row = 0; row < height; row++) + { // via rows + for(int column = 0; column < width; column++) + { // via columns + for(int bidx = 0; bidx < n_batch; bidx++) + { // via mini_batch + xhat_index = in_cstride * bidx + (width * row + column); + double bnrefowd = + bnScale(0, cidx, 0, 0) * xhat[xhat_index] + bias(0, cidx, 0, 0); + activationHostBwdElement(activMode, + gamma, + beta, + alpha, + dy_input(bidx, cidx, row, column), + bnrefowd, + y_input(bidx, cidx, row, column), + dyelem); + // double tmp1 = nhw * dy_input(bidx, cidx, row, column) - dbias(0, cidx, 0, 0); + 
double tmp1 = nhw * dyelem - dbias(0, cidx, 0, 0); + double tmp2 = -xhat[xhat_index] * dscale(0, cidx, 0, 0); + double tmp3 = (bnScale(0, cidx, 0, 0) * invVar) / nhw; + dx_out(bidx, cidx, row, column) = static_cast(tmp3 * (tmp2 + tmp1)); + } // end for(n_batchs) + } // for (column) + } // for (row) + }); // for (channel) +} + +template +void batchNormPerActHostFwdTrain(const tensor& input, + tensor& out, + const tensor& scale, + const tensor& bias, + double epsilon, + double expAvgFactor, + tensor& saveMean, + tensor& saveInvVar, + tensor& runMean, + tensor& runVar) +{ + + int height, width, n_batch, channels; + std::tie(n_batch, channels, height, width) = miopen::tien<4>(input.desc.GetLengths()); + const auto n = double(n_batch); + + miopen::par_for(channels, 1, [&](int cidx) { + double mean_accum = 0.; + double variance_accum = 0.; + double elemStd = 0.; + double elemInvVar = 0.; + double inhat = 0.; + double newRunMean = 0.; + double adjust = 0.; + + // process the batch per channel + for(int row = 0; row < height; row++) + { // via rows + for(int column = 0; column < width; column++) + { // via columns + + mean_accum = 0.; + variance_accum = 0.; + for(int bidx = 0; bidx < n_batch; bidx++) + { // via mini_batch + // #1 calculate the mean :: iterating through the stack of images in the + // mini_batch + auto intval = static_cast(input(bidx, cidx, row, column)); + mean_accum += intval; + variance_accum += intval * intval; + } + mean_accum /= n; + variance_accum /= n; + variance_accum = variance_accum - (mean_accum * mean_accum); + elemInvVar = 1.0 / double(sqrt(variance_accum + epsilon)); + + // #4 apply the normalization :: x_hat = (x_i - mean) / sqrt(variance_accum - + // epsilon) + for(int bidx = 0; bidx < n_batch; bidx++) + { // via mini_batch + elemStd = (input(bidx, cidx, row, column) - mean_accum); // (x_i - mean) + inhat = elemStd * elemInvVar; + // #5 Gamma and Beta adjust :: y_i = gamma*x_hat + beta + out(bidx, cidx, row, column) = static_cast( + 
scale(0, cidx, row, column) * inhat + bias(0, cidx, row, column)); + } // end for(n_batch) + + if(!runMean.data.empty()) + { + newRunMean = runMean(0, cidx, row, column) * (1.0 - expAvgFactor); + runMean(0, cidx, row, column) = + mean_accum * expAvgFactor + newRunMean; // newMean*factor + tmp + } + // var(n+1) = p * var(n-1) + (1 - p)*(b/b-1)*var(n) + if(!runVar.data.empty()) + { + adjust = (n_batch == 1) ? variance_accum : (n / (n - 1.0)) * variance_accum; + runVar(0, cidx, row, column) = + (1 - expAvgFactor) * runVar(0, cidx, row, column) + expAvgFactor * adjust; + } + if(!saveMean.data.empty() || !saveInvVar.data.empty()) + { + saveMean(0, cidx, row, column) = static_cast(mean_accum); + saveInvVar(0, cidx, row, column) = static_cast(elemInvVar); + } + + } // for (column) + } // for (row) + }); +} + +template +void batchNormPerActHostBwdTrain(const tensor& x_input, + const tensor& dy_input, + tensor& dx_out, + const tensor& scale, + tensor& dscale, + tensor& dbias, + const tensor& savedMean, + const tensor& savedInvVar) +{ + + int height, width, n_batch, channels; + std::tie(n_batch, channels, height, width) = miopen::tien<4>(x_input.desc.GetLengths()); + int in_cstride = height * width; + auto n = double(n_batch); + + miopen::par_for(channels, 1, [&](int cidx) { + double elemStd = 0.; + unsigned int xhat_index; + double mean = 0.; + double elemInvVar = 0.; + double dyelem = 0.; + double dxhat = 0.; + double dxhathat = 0.; + double tmp1 = 0.; + std::vector xhat(static_cast(n_batch) * in_cstride); + + // process the batch per channel + for(int row = 0; row < height; row++) + { // via rows + for(int column = 0; column < width; column++) + { // via columns + dxhat = 0.; + dxhathat = 0.; + + if(!savedMean.data.empty()) + { + mean = savedMean(0, cidx, row, column); // HxW elements + elemInvVar = savedInvVar(0, cidx, row, column); // HxW elements + } + else + { + double variance_accum = 0.; + double mean_accum = 0.; + + // process the batch per channel + for(int bidx = 
0; bidx < n_batch; bidx++) + { // via mini_batch + auto inval = static_cast(x_input(bidx, cidx, row, column)); + mean_accum += inval; + variance_accum += inval * inval; + } // end for (n) + + mean_accum /= n; + variance_accum /= n; + variance_accum += (-mean_accum * mean_accum); + + mean = mean_accum; + elemInvVar = 1.0 / sqrt(variance_accum); + } + + for(int bidx = 0; bidx < n_batch; bidx++) + { // via mini_batch + xhat_index = in_cstride * bidx + (width * row + column); + // per (x-dims) channel load a block of data into LDS + elemStd = static_cast(x_input(bidx, cidx, row, column)) - + mean; // (x_i - mean) + xhat[xhat_index] = elemStd * elemInvVar; + dyelem = static_cast(dy_input(bidx, cidx, row, column)); + dbias(0, cidx, row, column) += dyelem; + dscale(0, cidx, row, column) += xhat[xhat_index] * dyelem; + tmp1 = scale(0, cidx, row, column) * dyelem; + dxhat += tmp1; + dxhathat += tmp1 * xhat[xhat_index]; + + } // end for(n_batchs) + + for(int bidx = 0; bidx < n_batch; bidx++) + { // via mini_batch + xhat_index = in_cstride * bidx + (width * row + column); + tmp1 = xhat[xhat_index] * dxhathat + dxhat; + double tmp2 = + n_batch * scale(0, cidx, row, column) * dy_input(bidx, cidx, row, column) - + tmp1; + double tmp3 = elemInvVar / (double(n)); + dx_out(bidx, cidx, row, column) = static_cast(tmp3 * tmp2); + } // end for(n_batchs) + } // for (column) + } // for (row) + }); +} + +template +void batchNormActivPerActHostBwdTrain(miopenActivationMode_t activMode, + double gamma, + double beta, + double alpha, + const tensor& x_input, + const tensor& dy_input, + const tensor& y_input, + tensor& dx_out, + const tensor& scale, + const tensor& bias, + tensor& dscale, + tensor& dbias, + const tensor& savedMean, + const tensor& savedInvVar) +{ + + int height, width, n_batch, channels; + std::tie(n_batch, channels, height, width) = miopen::tien<4>(x_input.desc.GetLengths()); + int in_cstride = height * width; + auto n = double(n_batch); + + miopen::par_for(channels, 1, 
[&](int cidx) { + double elemStd = 0.; + unsigned int xhat_index; + double mean = 0.; + double elemInvVar = 0.; + double dyelem = 0.; + double dxhat = 0.; + double dxhathat = 0.; + double tmp1 = 0.; + std::vector xhat(static_cast(n_batch) * in_cstride); + + // process the batch per channel + for(int row = 0; row < height; row++) + { // via rows + for(int column = 0; column < width; column++) + { // via columns + dxhat = 0.; + dxhathat = 0.; + + mean = savedMean(0, cidx, row, column); // HxW elements + elemInvVar = savedInvVar(0, cidx, row, column); // HxW elements + + for(int bidx = 0; bidx < n_batch; bidx++) + { // via mini_batch + xhat_index = in_cstride * bidx + (width * row + column); + // per (x-dims) channel load a block of data into LDS + elemStd = static_cast(x_input(bidx, cidx, row, column)) - + mean; // (x_i - mean) + xhat[xhat_index] = elemStd * elemInvVar; + double bnrefowd = + scale(0, cidx, row, column) * xhat[xhat_index] + bias(0, cidx, row, column); + activationHostBwdElement(activMode, + gamma, + beta, + alpha, + dy_input(bidx, cidx, row, column), + bnrefowd, + y_input(bidx, cidx, row, column), + dyelem); + /*dyelem = static_cast(dy_input(bidx, cidx, row, column));*/ + dbias(0, cidx, row, column) += dyelem; + dscale(0, cidx, row, column) += xhat[xhat_index] * dyelem; + tmp1 = scale(0, cidx, row, column) * dyelem; + dxhat += tmp1; + dxhathat += tmp1 * xhat[xhat_index]; + + } // end for(n_batchs) + + for(int bidx = 0; bidx < n_batch; bidx++) + { // via mini_batch + xhat_index = in_cstride * bidx + (width * row + column); + tmp1 = xhat[xhat_index] * dxhathat + dxhat; + double bnrefowd = + scale(0, cidx, row, column) * xhat[xhat_index] + bias(0, cidx, row, column); + activationHostBwdElement(activMode, + gamma, + beta, + alpha, + dy_input(bidx, cidx, row, column), + bnrefowd, + y_input(bidx, cidx, row, column), + dyelem); + double tmp2 = (n_batch * scale(0, cidx, row, column) * dyelem) - tmp1; + double tmp3 = elemInvVar / (double(n)); + dx_out(bidx, 
cidx, row, column) = static_cast(tmp3 * tmp2); + } // end for(n_batchs) + } // for (column) + } // for (row) + }); +} + +template +void visitActivationHostInfer( + miopenActivationMode_t activMode, double gamma, double beta, double alpha, F f) +{ + switch(activMode) + { + case miopenActivationPASTHRU: // x + f([=](double x) { return x; }); + break; + case miopenActivationLOGISTIC: // 1 / (1 + e^-x) //Sigmoid + f([=](double x) { return (1. / (1. + std::exp(-x))); }); + break; + case miopenActivationTANH: // beta * tanh(alpha * x) + f([=](double x) { return (beta * std::tanh(alpha * x)); }); + break; + case miopenActivationRELU: // max(0, x) + f([=](double x) { return ((x > 0.) ? x : 0.); }); + break; + case miopenActivationSOFTRELU: // log(1 + e^x) // bonomial normal log likelihood + f([=](double x) { + return (x > 0.) ? (x + std::log1p(std::exp(-x))) : (std::log1p(std::exp(x))); + }); + break; + case miopenActivationABS: // abs(x) + f([=](double x) { return (std::fabs(x)); }); + break; + case miopenActivationPOWER: // (alpha + beta * x) ^ gamma + f([=](double x) { + auto v = (alpha + beta * x); + return (v <= std::numeric_limits::epsilon()) ? 0. : pow(v, gamma); + }); + break; + case miopenActivationCLIPPEDRELU: // min(alpha, max(0, x)) + f([=](double x) { return (std::min(alpha, std::max(double(0.), x))); }); + break; + case miopenActivationLEAKYRELU: // alpha * x | x<=0; x | x>0 + f([=](double x) { return ((x > 0.) ? x : x * alpha); }); + break; + case miopenActivationELU: // alpha * (exp(x)-1) | x<=0; x | x>0 + f([=](double x) { return ((x > 0.) ? 
x : alpha * std::expm1(x)); }); + break; + case miopenActivationCLAMP: // max(alpha, min(beta, x)) + f([=](double x) { return (std::max(alpha, std::min(beta, x))); }); + break; + // default: printf("ERROR: unknown neuron type: %d\n", activMode); break; + } +} + +template +inline void activationHostInfer(miopenActivationMode_t activMode, + double gamma, + double beta, + double alpha, + const std::vector input, + std::vector& output) +{ + visitActivationHostInfer(activMode, gamma, beta, alpha, [&](auto f) { + miopen::par_for(input.size(), 1, [&](int index) { + output[index] = static_cast(f(static_cast(input[index]))); + }); + }); +} + +template +void visitActivationHostBwd( + miopenActivationMode_t activMode, double gamma, double beta, double alpha, F f) +{ + switch(activMode) + { + case miopenActivationPASTHRU: // x + f([=](double dy, double, double) { return dy; }); + break; + case miopenActivationLOGISTIC: // 1 / (1 + e^-x) //Sigmoid + f([=](double dy, double, double y) { return dy * y * (1 - y); }); + break; + case miopenActivationTANH: // beta * tanh(alpha * x) + f([=](double dy, double, double y) { return dy * alpha * (beta - y * y / beta); }); + break; + case miopenActivationRELU: // max(0, x) + f([=](double dy, double x, double) { return (x > 0) ? dy : 0; }); + break; + case miopenActivationSOFTRELU: // log(1 + e^x) // bonomial normal log likelihood + f([=](double dy, double x, double) { + static const double threshold = 50.; + double expval = std::exp(std::min(x, threshold)); + return dy * expval / (expval + 1.0); + }); + break; + case miopenActivationABS: // abs(x) + f([=](double dy, double x, double) { return dy * ((x > 0) ? 1 : -1); }); + break; + case miopenActivationPOWER: // (alpha + beta * x) ^ gamma + f([=](double, double x, double y) { + auto v = alpha + beta * x; + return v <= std::numeric_limits::epsilon() ? 
0 : gamma * beta * y / v; + }); + break; + case miopenActivationCLIPPEDRELU: // min(alpha, max(0, x)) + f([=](double dy, double x, double) { return (x > 0 && x <= alpha) ? dy : 0; }); + break; + case miopenActivationLEAKYRELU: // alpha * x | x<=0; x | x>0 + f([=](double dy, double x, double) { return dy * ((x > 0) ? 1 : alpha); }); + break; + case miopenActivationELU: // alpah * (exp(x)-1) | x<=0; x | x>0 + f([=](double dy, double x, double y) { return dy * ((x > 0) ? 1 : y + alpha); }); + break; + case miopenActivationCLAMP: // max(alpha, min(beta, x)) + f([=](double dy, double x, double) { return (x > alpha && x <= beta) ? dy : 0; }); + break; + // default: printf("ERROR: unknown neuron type: %d\n", activMode); break; + } +} + +template +inline void activationHostBnormBwd(miopenActivationMode_t activMode, + double gamma, + double beta, + double alpha, + const std::vector dyinput, + const std::vector xinput, + std::vector& output) +{ + double dummy; + visitActivationHostBwd(activMode, gamma, beta, alpha, [&](auto f) { + miopen::par_for(dyinput.size(), 1, [&](int index) { + output[index] = static_cast( + f(static_cast(dyinput[index]), static_cast(xinput[index]), dummy)); + }); + }); +} + +template +inline void activationHostBwd(miopenActivationMode_t activMode, + double gamma, + double beta, + double alpha, + const std::vector dyinput, + const std::vector xinput, + const std::vector yinput, + std::vector& output) +{ + visitActivationHostBwd(activMode, gamma, beta, alpha, [&](auto f) { + miopen::par_for(dyinput.size(), 1, [&](int index) { + output[index] = static_cast(f(static_cast(dyinput[index]), + static_cast(xinput[index]), + static_cast(yinput[index]))); + }); + }); +} + +inline void activationHostBwdElement(miopenActivationMode_t activMode, + double gamma, + double beta, + double alpha, + const double dyinput, + const double xinput, + const double yinput, + double& output) +{ + visitActivationHostBwd(activMode, gamma, beta, alpha, [&](auto f) { + output = 
static_cast(f(dyinput, xinput, yinput)); + }); +} + +template +tensor get_output_tensor(const miopen::ConvolutionDescriptor& filter, + const tensor& input, + const tensor& weights) +{ + return tensor{filter.GetForwardOutputTensor(input.desc, weights.desc, miopen_type{})}; +} diff --git a/projects/miopen/miopen_utils/include/miopen_utils/gemm.hpp b/projects/miopen/miopen_utils/include/miopen_utils/gemm.hpp new file mode 100644 index 000000000000..81c38db0fdf3 --- /dev/null +++ b/projects/miopen/miopen_utils/include/miopen_utils/gemm.hpp @@ -0,0 +1,120 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2017 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + *******************************************************************************/ +#ifndef GUARD_GEMM_HPP +#define GUARD_GEMM_HPP + +#include +#include +#include + +/* + A and B rows and cols should be passed as default values (NxM, MxK), independently of + a_transponse/b_transpose flag value + C rows and cols should have correct values based on a_transponse/b_transpose values + A, B, C strides should have corret values based on a_transponse/b_transpose values +*/ +template +void gemm_cpu(const Dtype* a_ptr, + const size_t a_cols, + const size_t a_rows, + const size_t a_stride, + const bool a_transpose, + const Dtype* b_ptr, + const size_t b_cols, + const size_t b_rows, + const size_t b_stride, + const bool b_transpose, + Dtype* c_ptr, + const size_t c_cols, + const size_t c_rows, + const size_t c_stride, + double alpha = 1.0, + double beta = 1.0) +{ + if((!a_transpose && !b_transpose && + ((a_cols != b_rows) || (a_rows != c_rows) || (b_cols != c_cols))) || + (a_transpose && b_transpose && + ((a_rows != b_cols) || (a_cols != c_rows) || (b_rows != c_cols))) || + (a_transpose && !b_transpose && + ((a_rows != b_rows) || (a_cols != c_rows) || (b_cols != c_cols))) || + (!a_transpose && b_transpose && + ((a_cols != b_cols) || (a_rows != c_rows) || (b_rows != c_cols)))) + { + MIOPEN_THROW("MM_CPU_ERROR. Incompatible matrix size:\nA: " + std::to_string(a_rows) + "x" + + std::to_string(a_cols) + " transpose: " + (a_transpose ? "true" : "false") + + "\nB: " + std::to_string(b_rows) + "x" + std::to_string(b_cols) + + " transpose: " + (b_transpose ? "true" : "false") + + "\nC: " + std::to_string(c_rows) + "x" + std::to_string(c_cols) + "\n"); + } + + size_t inner_loop_limit = a_transpose ? 
a_rows : a_cols; + auto inner_loop = [&](int m, int n) { + double el = 0.0; + if(!a_transpose && !b_transpose) + { + miopen::ford(inner_loop_limit)([&](int k) { + el += static_cast(a_ptr[m * a_stride + k]) * + static_cast(b_ptr[k * b_stride + n]); + }); + } + else if(!a_transpose && b_transpose) + { + miopen::ford(inner_loop_limit)([&](int k) { + el += static_cast(a_ptr[m * a_stride + k]) * + static_cast(b_ptr[n * b_stride + k]); + }); + } + else if(a_transpose && !b_transpose) + { + miopen::ford(inner_loop_limit)([&](int k) { + el += static_cast(a_ptr[k * a_stride + m]) * + static_cast(b_ptr[k * b_stride + n]); + }); + } + else + { + miopen::ford(inner_loop_limit)([&](int k) { + el += static_cast(a_ptr[k * a_stride + m]) * + static_cast(b_ptr[n * b_stride + k]); + }); + } + + c_ptr[m * c_stride + n] = + static_cast(beta * static_cast(c_ptr[m * c_stride + n]) + alpha * el); + }; + + constexpr size_t iter_margin = 1'048'576; // 2^20 + if(c_rows * c_cols * inner_loop_limit > iter_margin) + { + miopen::par_ford(c_rows, c_cols)(inner_loop); + } + else + { + miopen::ford(c_rows, c_cols)(inner_loop); + } +} + +#endif diff --git a/projects/miopen/miopen_utils/include/miopen_utils/gpu_mem.hpp b/projects/miopen/miopen_utils/include/miopen_utils/gpu_mem.hpp new file mode 100644 index 000000000000..ee1f52b3090d --- /dev/null +++ b/projects/miopen/miopen_utils/include/miopen_utils/gpu_mem.hpp @@ -0,0 +1,12 @@ +// Forwarding header — GPUMem is defined in driver/driver.hpp. +// This allows test code to include GPUMem without directly depending +// on the driver/ directory. The GPUMem class should eventually be +// extracted into a standalone header here. +#ifndef GUARD_MIOPEN_UTILS_GPU_MEM_HPP +#define GUARD_MIOPEN_UTILS_GPU_MEM_HPP + +// Phase 1: Forward to driver.hpp which defines GPUMem. +// Phase 2: Extract GPUMem into this file directly. 
+#include "../../driver/driver.hpp" + +#endif diff --git a/projects/miopen/miopen_utils/include/miopen_utils/mloSoftmaxHost.hpp b/projects/miopen/miopen_utils/include/miopen_utils/mloSoftmaxHost.hpp new file mode 100644 index 000000000000..fd0a1768e6a6 --- /dev/null +++ b/projects/miopen/miopen_utils/include/miopen_utils/mloSoftmaxHost.hpp @@ -0,0 +1,350 @@ +// Copyright (c) Advanced Micro Devices, Inc., or its affiliates. +// SPDX-License-Identifier: MIT + +#ifndef MLO_SOFTMAXHOST_H_ +#define MLO_SOFTMAXHOST_H_ + +#include +#include + +//////////////////////////////////////////////////////////// +// +/////////////////////////////////////////////////////////// + +#define NEGATIVE_INF_FP32 (-1e20) +#define NEGATIVE_INF_FP16 (-1e5) + +template +T logaddexp(T x, T y, T neg_inf) +{ + T a = std::max(x, y); + T b = std::min(x, y); + T c = b - a; + + return c <= neg_inf ? std::max(a, neg_inf) : std::max(T(a + log(T(1) + exp(b - a))), neg_inf); +} + +template +int mloSoftmaxForwardRunHost(miopenTensorDescriptor_t inputTensor, + miopenTensorDescriptor_t outputTensor, + Tgpu* in, + Tcheck* outhost, + float alpha, + float beta, + miopenSoftmaxAlgorithm_t algo, + miopenSoftmaxMode_t mode) +{ + int n, c, h, w, in_nstr, in_cstr, in_hstr, in_wstr; + int out_nstr, out_cstr, out_hstr, out_wstr; + miopenGet4dTensorDescriptorLengths(inputTensor, &n, &c, &h, &w); + miopenGet4dTensorDescriptorStrides(inputTensor, &in_nstr, &in_cstr, &in_hstr, &in_wstr); + miopenGet4dTensorDescriptorStrides(outputTensor, &out_nstr, &out_cstr, &out_hstr, &out_wstr); + + Tcheck max_val = (sizeof(Tgpu) == 4) ? 3.402823466e+38f : 65504.; + std::vector channel_max((mode == MIOPEN_SOFTMAX_MODE_INSTANCE ? 
n : n * h * w), + static_cast(-max_val)); + std::vector results(n * c * h * w, static_cast(0.0)); + + int ret = 0; + + if(mode == MIOPEN_SOFTMAX_MODE_INSTANCE) + { + for(int i = 0; i < n; i++) + { + if(algo == MIOPEN_SOFTMAX_FAST) + { + for(int j = 0; j < c; j++) + for(int s0 = 0; s0 < h; s0++) + for(int s1 = 0; s1 < w; s1++) + { + results[(i * c + j) * h * w + s0 * w + s1] = static_cast( + in[i * in_nstr + j * in_cstr + s0 * in_hstr + s1 * in_wstr]); + } + } + else + { + for(int j = 0; j < c; j++) + for(int s0 = 0; s0 < h; s0++) + for(int s1 = 0; s1 < w; s1++) + { + channel_max[i] = std::max( + static_cast( + in[i * in_nstr + j * in_cstr + s0 * in_hstr + s1 * in_wstr]), + channel_max[i]); + } + + for(int j = 0; j < c; j++) + for(int s0 = 0; s0 < h; s0++) + for(int s1 = 0; s1 < w; s1++) + { + results[(i * c + j) * h * w + s0 * w + s1] = + static_cast( + in[i * in_nstr + j * in_cstr + s0 * in_hstr + s1 * in_wstr]) - + channel_max[i]; + } + } + + if(algo == MIOPEN_SOFTMAX_LOG) + { + Tcheck neg_inf = static_cast( + miopen::deref(inputTensor).GetType() == miopenHalf ? 
NEGATIVE_INF_FP16 + : NEGATIVE_INF_FP32); + channel_max[i] = neg_inf; + for(int j = 0; j < c; j++) + for(int s0 = 0; s0 < h; s0++) + for(int s1 = 0; s1 < w; s1++) + { + channel_max[i] = logaddexp(results[(i * c + j) * h * w + s0 * w + s1], + channel_max[i], + neg_inf); + } + + for(int j = 0; j < c; j++) + for(int s0 = 0; s0 < h; s0++) + for(int s1 = 0; s1 < w; s1++) + { + outhost[i * out_nstr + j * out_cstr + s0 * out_hstr + s1 * out_wstr] = + alpha * + (results[(i * c + j) * h * w + s0 * w + s1] - channel_max[i]) + + beta * outhost[i * out_nstr + j * out_cstr + s0 * out_hstr + + s1 * out_wstr]; + } + } + else + { + channel_max[i] = 0.0; + for(int j = 0; j < c; j++) + for(int s0 = 0; s0 < h; s0++) + for(int s1 = 0; s1 < w; s1++) + { + results[(i * c + j) * h * w + s0 * w + s1] = + exp(results[(i * c + j) * h * w + s0 * w + s1]); + channel_max[i] += results[(i * c + j) * h * w + s0 * w + s1]; + } + + for(int j = 0; j < c; j++) + for(int s0 = 0; s0 < h; s0++) + for(int s1 = 0; s1 < w; s1++) + { + outhost[i * out_nstr + j * out_cstr + s0 * out_hstr + s1 * out_wstr] = + alpha * + (results[(i * c + j) * h * w + s0 * w + s1] / channel_max[i]) + + beta * outhost[i * out_nstr + j * out_cstr + s0 * out_hstr + + s1 * out_wstr]; + } + } + } + } + else + { + for(int i = 0; i < n; i++) + { + for(int s0 = 0; s0 < h; s0++) + for(int s1 = 0; s1 < w; s1++) + { + if(algo == MIOPEN_SOFTMAX_FAST) + { + for(int j = 0; j < c; j++) + { + results[(i * c + j) * h * w + s0 * w + s1] = static_cast( + in[i * in_nstr + j * in_cstr + s0 * in_hstr + s1 * in_wstr]); + } + } + else + { + for(int j = 0; j < c; j++) + { + channel_max[i * h * w + s0 * w + s1] = std::max( + static_cast( + in[i * in_nstr + j * in_cstr + s0 * in_hstr + s1 * in_wstr]), + channel_max[i * h * w + s0 * w + s1]); + } + + for(int j = 0; j < c; j++) + { + results[(i * c + j) * h * w + s0 * w + s1] = + static_cast( + in[i * in_nstr + j * in_cstr + s0 * in_hstr + s1 * in_wstr]) - + channel_max[i * h * w + s0 * w + s1]; + } + } + 
+ if(algo == MIOPEN_SOFTMAX_LOG) + { + Tcheck neg_inf = static_cast( + miopen::deref(inputTensor).GetType() == miopenHalf ? NEGATIVE_INF_FP16 + : NEGATIVE_INF_FP32); + channel_max[i * h * w + s0 * w + s1] = results[i * c * h * w + s0 * w + s1]; + for(int j = 1; j < c; j++) + { + channel_max[i * h * w + s0 * w + s1] = + logaddexp(results[(i * c + j) * h * w + s0 * w + s1], + channel_max[i * h * w + s0 * w + s1], + neg_inf); + } + + for(int j = 0; j < c; j++) + { + outhost[i * out_nstr + j * out_cstr + s0 * out_hstr + s1 * out_wstr] = + alpha * (results[(i * c + j) * h * w + s0 * w + s1] - + channel_max[i * h * w + s0 * w + s1]) + + beta * outhost[i * out_nstr + j * out_cstr + s0 * out_hstr + + s1 * out_wstr]; + } + } + else + { + channel_max[i * h * w + s0 * w + s1] = 0.0; + for(int j = 0; j < c; j++) + { + results[(i * c + j) * h * w + s0 * w + s1] = + exp(results[(i * c + j) * h * w + s0 * w + s1]); + channel_max[i * h * w + s0 * w + s1] += + results[(i * c + j) * h * w + s0 * w + s1]; + } + + for(int j = 0; j < c; j++) + { + outhost[i * out_nstr + j * out_cstr + s0 * out_hstr + s1 * out_wstr] = + alpha * (results[(i * c + j) * h * w + s0 * w + s1] / + channel_max[i * h * w + s0 * w + s1]) + + beta * outhost[i * out_nstr + j * out_cstr + s0 * out_hstr + + s1 * out_wstr]; + } + } + } + } + } + + return ret; +} + +template +int mloSoftmaxBackwardRunHost(miopenTensorDescriptor_t dInputTensor, + miopenTensorDescriptor_t dOutputTensor, + Tgpu* out, + Tgpu* dout, + Tcheck* dinhost, + float alpha, + float beta, + miopenSoftmaxAlgorithm_t algo, + miopenSoftmaxMode_t mode) +{ + int n, c, h, w, in_nstr, in_cstr, in_hstr, in_wstr; + int out_nstr, out_cstr, out_hstr, out_wstr; + miopenGet4dTensorDescriptorLengths(dOutputTensor, &n, &c, &h, &w); + miopenGet4dTensorDescriptorStrides(dInputTensor, &in_nstr, &in_cstr, &in_hstr, &in_wstr); + miopenGet4dTensorDescriptorStrides(dOutputTensor, &out_nstr, &out_cstr, &out_hstr, &out_wstr); + + std::vector channel_dot((mode == 
MIOPEN_SOFTMAX_MODE_INSTANCE ? n : n * h * w), + static_cast(0.0)); + std::vector results(n * c * h * w, static_cast(0.0)); + + int ret = 0; + + for(int i = 0; i < n; i++) + { + if(mode == MIOPEN_SOFTMAX_MODE_INSTANCE) + { + for(int j = 0; j < c; j++) + for(int s0 = 0; s0 < h; s0++) + for(int s1 = 0; s1 < w; s1++) + { + if(algo == MIOPEN_SOFTMAX_LOG) + { + channel_dot[i] += static_cast( + dout[i * out_nstr + j * out_cstr + s0 * out_hstr + s1 * out_wstr]); + } + else + { + channel_dot[i] += + static_cast(out[i * out_nstr + j * out_cstr + + s0 * out_hstr + s1 * out_wstr]) * + static_cast(dout[i * out_nstr + j * out_cstr + + s0 * out_hstr + s1 * out_wstr]); + } + } + + for(int j = 0; j < c; j++) + for(int s0 = 0; s0 < h; s0++) + for(int s1 = 0; s1 < w; s1++) + { + if(algo == MIOPEN_SOFTMAX_LOG) + { + results[(i * c + j) * h * w + s0 * w + s1] = + static_cast(dout[i * out_nstr + j * out_cstr + + s0 * out_hstr + s1 * out_wstr]) - + channel_dot[i] * std::exp(out[i * out_nstr + j * out_cstr + + s0 * out_hstr + s1 * out_wstr]); + } + else + { + results[(i * c + j) * h * w + s0 * w + s1] = + static_cast(dout[i * out_nstr + j * out_cstr + + s0 * out_hstr + s1 * out_wstr]) - + channel_dot[i]; + + results[(i * c + j) * h * w + s0 * w + s1] *= static_cast( + out[i * out_nstr + j * out_cstr + s0 * out_hstr + s1 * out_wstr]); + } + dinhost[i * in_nstr + j * in_cstr + s0 * in_hstr + s1 * in_wstr] = + alpha * results[(i * c + j) * h * w + s0 * w + s1] + + beta * dinhost[i * in_nstr + j * in_cstr + s0 * in_hstr + s1 * in_wstr]; + } + } + else + { + for(int s0 = 0; s0 < h; s0++) + for(int s1 = 0; s1 < w; s1++) + { + for(int j = 0; j < c; j++) + { + if(algo == MIOPEN_SOFTMAX_LOG) + { + channel_dot[i * h * w + s0 * w + s1] += static_cast( + dout[i * out_nstr + j * out_cstr + s0 * out_hstr + s1 * out_wstr]); + } + else + { + channel_dot[i * h * w + s0 * w + s1] += + static_cast(out[i * out_nstr + j * out_cstr + + s0 * out_hstr + s1 * out_wstr]) * + static_cast(dout[i * out_nstr + j * 
out_cstr + + s0 * out_hstr + s1 * out_wstr]); + } + } + + for(int j = 0; j < c; j++) + { + if(algo == MIOPEN_SOFTMAX_LOG) + { + results[(i * c + j) * h * w + s0 * w + s1] = + static_cast(dout[i * out_nstr + j * out_cstr + + s0 * out_hstr + s1 * out_wstr]) - + channel_dot[i * h * w + s0 * w + s1] * + std::exp(out[i * out_nstr + j * out_cstr + s0 * out_hstr + + s1 * out_wstr]); + } + else + { + results[(i * c + j) * h * w + s0 * w + s1] = + static_cast(dout[i * out_nstr + j * out_cstr + + s0 * out_hstr + s1 * out_wstr]) - + channel_dot[i * h * w + s0 * w + s1]; + + results[(i * c + j) * h * w + s0 * w + s1] *= static_cast( + out[i * out_nstr + j * out_cstr + s0 * out_hstr + s1 * out_wstr]); + } + dinhost[i * in_nstr + j * in_cstr + s0 * in_hstr + s1 * in_wstr] = + alpha * results[(i * c + j) * h * w + s0 * w + s1] + + beta * dinhost[i * in_nstr + j * in_cstr + s0 * in_hstr + s1 * in_wstr]; + } + } + } + } + + return ret; +} + +#endif diff --git a/projects/miopen/miopen_utils/include/miopen_utils/network_data.hpp b/projects/miopen/miopen_utils/include/miopen_utils/network_data.hpp new file mode 100644 index 000000000000..987d4dda9929 --- /dev/null +++ b/projects/miopen/miopen_utils/include/miopen_utils/network_data.hpp @@ -0,0 +1,438 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2017 Advanced Micro Devices, Inc. 
/* (license continued)
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 *
 *******************************************************************************/

#ifndef GUARD_MIOPEN_TEST_NETWORK_DATA_HPP
#define GUARD_MIOPEN_TEST_NETWORK_DATA_HPP

// NOTE(review): the original #include targets were lost during extraction;
// the four headers below are reconstructed from the identifiers used in this
// file -- confirm against the upstream header.
#include <cstdint>
#include <set>
#include <type_traits>
#include <vector>

#ifndef MIOPEN_TEST_DEFAULT_BATCH_SIZE_FACTOR
#define MIOPEN_TEST_DEFAULT_BATCH_SIZE_FACTOR 0
#endif

// Scales a canonical batch size x by a divisor y.
// y == 0 (the default factor) or y > x collapses the batch to 1, so the
// default configuration runs every network shape with batch size 1.
template <class T>
inline constexpr T pick_batch_size(T x, T y)
{
    return (y == 0 || y > x) ? 1 : x / y;
}

// Set to 0 to drop the very large configurations and reduce test execution
// time; 1 keeps them enabled.
#define MIOPEN_TESTS_GET_INPUTS_ENABLE_HUGE_TENSORS 1

// Canonical NCHW input shapes (classic CNN layers) used across the test suite.
// Returned as a std::set, so shapes that collapse to the same value after
// batch scaling are automatically de-duplicated.
template <class T>
inline std::set<std::vector<T>> get_inputs(T n = MIOPEN_TEST_DEFAULT_BATCH_SIZE_FACTOR)
{
    // clang-format off
    return
    {
        { pick_batch_size(32,  n), 1,    14,   14  },
        { pick_batch_size(100, n), 1,    8,    8   },
        { pick_batch_size(256, n), 1,    27,   27  },
#if MIOPEN_TESTS_GET_INPUTS_ENABLE_HUGE_TENSORS
        { pick_batch_size(64,  n), 19,   1024, 2048},
#endif
        { pick_batch_size(100, n), 3,    32,   32  },
        { pick_batch_size(100, n), 32,   16,   16  },
        { pick_batch_size(100, n), 32,   8,    8   },
        { pick_batch_size(128, n), 256,  12,   12  },
        { pick_batch_size(128, n), 3,    231,  231 },
        { pick_batch_size(128, n), 512,  12,   12  },
        { pick_batch_size(256, n), 256,  13,   13  },
        { pick_batch_size(256, n), 3,    227,  227 },
        { pick_batch_size(256, n), 384,  13,   13  },
        { pick_batch_size(256, n), 96,   27,   27  },
        { pick_batch_size(32,  n), 128,  28,   28  },
        { pick_batch_size(32,  n), 144,  14,   14  },
        { pick_batch_size(32,  n), 192,  28,   28  },
        { pick_batch_size(32,  n), 192,  7,    7   },
        { pick_batch_size(32,  n), 256,  28,   28  },
        { pick_batch_size(32,  n), 3,    224,  224 },
        { pick_batch_size(32,  n), 32,   28,   28  },
        { pick_batch_size(32,  n), 48,   7,    7   },
        { pick_batch_size(32,  n), 480,  128,  256 },
        { pick_batch_size(32,  n), 480,  64,   128 },
        { pick_batch_size(32,  n), 512,  4,    4   },
        { pick_batch_size(32,  n), 512,  64,   128 },
        { pick_batch_size(16,  n), 64,   56,   56  },
        { pick_batch_size(32,  n), 832,  7,    7   },
        { pick_batch_size(64,  n), 128,  56,   56  },
        { pick_batch_size(64,  n), 256,  28,   28  },
        { pick_batch_size(64,  n), 3,    224,  224 },
        { pick_batch_size(64,  n), 512,  28,   28  },
        { pick_batch_size(64,  n), 64,   112,  112 },
        { pick_batch_size(32,  n), 64,   14,   14  },
        { pick_batch_size(32,  n), 192,  14,   14  },
        { pick_batch_size(32,  n), 320,  28,   28  },
        { pick_batch_size(32,  n), 576,  14,   14  },
        { pick_batch_size(32,  n), 576,  4,    4   },
        { pick_batch_size(32,  n), 1056, 7,    7   },
        { pick_batch_size(32,  n), 2048, 11,   11  },
#if MIOPEN_TESTS_GET_INPUTS_ENABLE_HUGE_TENSORS
        { pick_batch_size(32,  n), 16,   2048, 2048 },
        { pick_batch_size(32,  n), 16,   3072, 3072 },
        { pick_batch_size(32,  n), 16,   4096, 4096 },
#endif
        { 1, 1, 1, 1 }
    };
    // clang-format on
}

// Canonical 4-D convolution filter shapes (K, C, Y, X).
template <class T>
inline std::set<std::vector<T>> get_weights(T n = MIOPEN_TEST_DEFAULT_BATCH_SIZE_FACTOR)
{
    // clang-format off
    return
    {
        { pick_batch_size(1024, n), 1024, 3,  3  },
        { pick_batch_size(1024, n), 512,  3,  3  },
        { pick_batch_size(128,  n), 256,  1,  1  },
        { pick_batch_size(128,  n), 528,  1,  1  },
        { pick_batch_size(128,  n), 96,   3,  3  },
        { pick_batch_size(16,   n), 192,  1,  1  },
        { pick_batch_size(224,  n), 112,  3,  3  },
        { pick_batch_size(256,  n), 96,   5,  5  },
        { pick_batch_size(288,  n), 144,  3,  3  },
        { pick_batch_size(48,   n), 832,  1,  1  },
        { pick_batch_size(512,  n), 256,  3,  3  },
        { pick_batch_size(64,   n), 1,    2,  2  },
        { pick_batch_size(64,   n), 3,    3,  3  },
        { pick_batch_size(64,   n), 3,    7,  7  },
        { pick_batch_size(64,   n), 32,   5,  5  },
        { pick_batch_size(64,   n), 480,  1,  1  },
        { pick_batch_size(64,   n), 64,   1,  1  },
        { pick_batch_size(96,   n), 3,    11, 11 },
        { pick_batch_size(192,  n), 64,   5,  5  },
        { pick_batch_size(64,   n), 64,   3,  3  },
        { pick_batch_size(224,  n), 224,  3,  3  },
        { pick_batch_size(224,  n), 192,  3,  3  },
        { pick_batch_size(128,  n), 320,  1,  1  },
        { pick_batch_size(192,  n), 576,  1,  1  },
        { pick_batch_size(128,  n), 1056, 1,  1  },
        { pick_batch_size(128,  n), 1024, 1,  1  },
        { pick_batch_size(512,  n), 2048, 1,  1  }
    };
    // clang-format on
}

// Reduced input set for immediate-mode convolution tests.
template <class T>
inline std::set<std::vector<T>> get_immed_inputs(T n = MIOPEN_TEST_DEFAULT_BATCH_SIZE_FACTOR)
{
    // clang-format off
    return
    {
        { pick_batch_size(32,  n), 1,   14,  14  },
        { pick_batch_size(256, n), 1,   27,  27  },
        { pick_batch_size(128, n), 512, 12,  12  },
        { pick_batch_size(256, n), 256, 13,  13  },
        { pick_batch_size(256, n), 3,   227, 227 },
        { pick_batch_size(32,  n), 64,  56,  56  },
        { pick_batch_size(32,  n), 96,  14,  14  },
        { pick_batch_size(32,  n), 96,  28,  28  },
        { pick_batch_size(64,  n), 128, 56,  56  },
        { pick_batch_size(64,  n), 3,   224, 224 },
        { pick_batch_size(64,  n), 256, 14,  14  },
        { 1, 1, 1, 1 }
    };
    // clang-format on
}

// Reduced filter set for immediate-mode convolution tests.
// NOTE(review): { pick_batch_size(256, n), 256, 3, 3 } appears twice in the
// original; the std::set silently de-duplicates it.
template <class T>
inline std::set<std::vector<T>> get_immed_weights(T n = MIOPEN_TEST_DEFAULT_BATCH_SIZE_FACTOR)
{
    // clang-format off
    return
    {
        { pick_batch_size(208, n), 96,  3,  3  },
        { pick_batch_size(24,  n), 512, 1,  1  },
        { pick_batch_size(256, n), 128, 3,  3  },
        { pick_batch_size(256, n), 256, 3,  3  },
        { pick_batch_size(256, n), 64,  5,  5  },
        { pick_batch_size(288, n), 144, 3,  3  },
        { pick_batch_size(96,  n), 3,   11, 11 },
        { pick_batch_size(32,  n), 128, 5,  5  },
        { pick_batch_size(32,  n), 128, 1,  1  },
        { pick_batch_size(256, n), 256, 3,  3  },
        { pick_batch_size(512, n), 512, 3,  3  },
        { pick_batch_size(160, n), 128, 3,  3  },
        { pick_batch_size(32,  n), 3,   7,  7  }
    };
    // clang-format on
}

// 5-D (NCDHW) input shapes for 3-D convolution tests.
template <class T>
inline std::set<std::vector<T>>
get_3d_conv_input_shapes(T n = MIOPEN_TEST_DEFAULT_BATCH_SIZE_FACTOR)
{
    // clang-format off
    return
    {
        { pick_batch_size(128, n), 1,   1, 2,   2  },
        { pick_batch_size(128, n), 64,  1, 1,   1  },
        { pick_batch_size(128, n), 64,  3, 4,   4  },
        { pick_batch_size(352, n), 32,  4, 9,   9  },
        { pick_batch_size(192, n), 512, 3, 14,  14 },
        { pick_batch_size(352, n), 512, 4, 28,  28 },
        { pick_batch_size(256, n), 512, 4, 56,  56 },
        { pick_batch_size(192, n), 3,   4, 227, 227},
        { pick_batch_size(128, n), 4,   4, 161, 700}
    };
    // clang-format on
}

// 5-D filter shapes for 3-D convolution tests.
template <class T>
inline std::set<std::vector<T>>
get_3d_conv_weight_shapes(T n = MIOPEN_TEST_DEFAULT_BATCH_SIZE_FACTOR)
{
    // clang-format off
    return
    {
        { pick_batch_size( 128, n), 1,   1, 1,  1 },
        { pick_batch_size( 352, n), 128, 1, 1,  1 },
        { pick_batch_size( 256, n), 128, 1, 1,  1 },
        { pick_batch_size( 352, n), 32,  3, 3,  3 },
        { pick_batch_size( 352, n), 4,   3, 3,  3 },
        { pick_batch_size( 160, n), 4,   3, 5,  5 },
        { pick_batch_size( 128, n), 64,  5, 7,  7 },
        { pick_batch_size( 192, n), 4,   3, 11, 11},
        { pick_batch_size( 128, n), 1,   3, 1,  7 },
        { pick_batch_size( 128, n), 1,   3, 7,  1 },
        { pick_batch_size( 128, n), 1,   3, 5,  20}
    };
    // clang-format on
}

// Input shapes for per-activation batch-norm tests (ResNet-152 and
// Inception-v3 derived, per the inline comments carried over from upstream).
template <class T>
inline std::set<std::vector<T>> get_bn_peract_inputs(T n = MIOPEN_TEST_DEFAULT_BATCH_SIZE_FACTOR)
{
    // clang-format off
    return
    {
        { pick_batch_size(32,  n), 4,    1024,2048},  //Making this much smaller
        { pick_batch_size(100, n), 3,    32,  32  },
        { pick_batch_size(100, n), 32,   8,   8   },
        { pick_batch_size(128, n), 256,  12,  12  },
        { pick_batch_size(256, n), 3,    227, 227 },
        { pick_batch_size(64,  n), 64,   112, 112 },  //Batch-norm ResNet 152 after this line
        { pick_batch_size(256, n), 1024, 14,  14  },  // n is from the paper @ 256
        { pick_batch_size(256, n), 2048, 7,   7   },
        { pick_batch_size(256, n), 256,  56,  56  },
        { pick_batch_size(256, n), 256,  14,  14  },
        { pick_batch_size(256, n), 512,  28,  28  },
        { pick_batch_size(256, n), 512,  7,   7   },
        { pick_batch_size(256, n), 64,   112, 112 },
        { pick_batch_size(256, n), 64,   56,  56  },  //Batch-norm Inception_v3 after this
        { pick_batch_size(32,  n), 1024, 1,   1   },  // n is from the paper @ 32
        { pick_batch_size(32,  n), 128,  14,  14  },
        { pick_batch_size(32,  n), 128,  28,  28  },
        { pick_batch_size(32,  n), 128,  4,   4   },
        { pick_batch_size(32,  n), 128,  7,   7   },
        { pick_batch_size(32,  n), 160,  7,   7   },
        { pick_batch_size(32,  n), 192,  14,  14  },
        { pick_batch_size(32,  n), 192,  56,  56  },
        { pick_batch_size(32,  n), 192,  7,   7   },
        { pick_batch_size(32,  n), 224,  14,  14  },
        { pick_batch_size(32,  n), 256,  7,   7   },
        { pick_batch_size(32,  n), 256,  14,  14  },
        { pick_batch_size(32,  n), 352,  7,   7   },
        { pick_batch_size(32,  n), 64,   112, 112 },
        { pick_batch_size(32,  n), 64,   14,  14  },
        { pick_batch_size(32,  n), 64,   56,  56  },
        { pick_batch_size(32,  n), 96,   28,  28  },
        { pick_batch_size(32,  n), 32,   256, 512 },  //Killing this config. Takes way too long on the CPU
        { pick_batch_size(32,  n), 256,  28,  28  },
        { pick_batch_size(32,  n), 3,    224, 224 },
        { pick_batch_size(32,  n), 480,  128, 256 },
        { pick_batch_size(32,  n), 528,  64,  128 }
    };
    // clang-format on
}

// Input shapes for spatial batch-norm tests.
// NOTE(review): {32, 192, 256, 512} and {32, 480, 128, 256} each appear twice
// in the original list; the std::set de-duplicates them.
template <class T>
inline std::set<std::vector<T>> get_bn_spatial_inputs(T n = MIOPEN_TEST_DEFAULT_BATCH_SIZE_FACTOR)
{
    // clang-format off
    return
    {
        { pick_batch_size(32,  n), 4,    1024,2048},  //Making this much smaller
        { pick_batch_size(32,  n), 192,  256, 512 },
        { pick_batch_size(32,  n), 480,  128, 256 },
        { pick_batch_size(256, n), 3,    227, 227 },
        { pick_batch_size(256, n), 64,   112, 112 },
        { pick_batch_size(512, n), 16,   32,  32  },
        { pick_batch_size(100, n), 32,   8,   8   },
        { pick_batch_size(128, n), 256,  12,  12  },
        { pick_batch_size(256, n), 128,  28,  28  },
        { pick_batch_size(256, n), 2048, 7,   7   },
        { pick_batch_size(256, n), 256,  56,  56  },
        { pick_batch_size(256, n), 256,  14,  14  },
        { pick_batch_size(256, n), 512,  28,  28  },
        { pick_batch_size(256, n), 512,  7,   7   },
        { pick_batch_size(256, n), 64,   56,  56  },  //Batch-norm Inception_v3 after this
        { pick_batch_size(32,  n), 1024, 1,   1   },  // n is from the paper @ 32
        { pick_batch_size(32,  n), 128,  14,  14  },
        { pick_batch_size(32,  n), 128,  4,   4   },
        { pick_batch_size(32,  n), 160,  7,   7   },
        { pick_batch_size(32,  n), 192,  14,  14  },
        { pick_batch_size(32,  n), 192,  56,  56  },
        { pick_batch_size(32,  n), 192,  7,   7   },
        { pick_batch_size(32,  n), 224,  14,  14  },
        { pick_batch_size(32,  n), 256,  7,   7   },
        { pick_batch_size(32,  n), 352,  7,   7   },
        { pick_batch_size(32,  n), 64,   14,  14  },
        { pick_batch_size(32,  n), 64,   28,  28  },
        { pick_batch_size(32,  n), 64,   56,  56  },
        { pick_batch_size(32,  n), 96,   28,  28  },
        { pick_batch_size(32,  n), 192,  256, 512 },
        { pick_batch_size(32,  n), 256,  28,  28  },
        { pick_batch_size(32,  n), 3,    224, 224 },
        { pick_batch_size(32,  n), 480,  128, 256 },
        { pick_batch_size(32,  n), 528,  64,  128 },
        { pick_batch_size(770, n), 1,    8,   8   },
        { pick_batch_size(770, n), 1024, 1,   1   },
        { pick_batch_size(152, n), 128,  80,  80  },
        { pick_batch_size(152, n), 256,  20,  20  },
        { pick_batch_size(152, n), 32,   160, 160 },
        { pick_batch_size(152, n), 512,  20,  20  },
        { pick_batch_size(152, n), 64,   160, 160 },
        { pick_batch_size(152, n), 64,   80,  80  },
        { pick_batch_size(256, n), 256,  20,  20  },
        { pick_batch_size(256, n), 512,  20,  20  }
    };
    // clang-format on
}

// 5-D input shapes for per-activation batch-norm over volumes (VoxNet,
// CVPR'15 hand-gesture nets, multi-view 3D convnets, video 3D convnets).
template <class T>
inline std::set<std::vector<T>> get_3d_bn_peract_inputs(T n = MIOPEN_TEST_DEFAULT_BATCH_SIZE_FACTOR)
{
    // clang-format off
    return
    {
        { pick_batch_size(32,  n), 1,  32,  32,  32  }, // 32x32x32 based on VoxNet arch
        { pick_batch_size(32,  n), 1,  14,  14,  14  },
        { pick_batch_size(32,  n), 32, 14,  14,  14  },
        { pick_batch_size(32,  n), 32, 12,  12,  12  },
        { pick_batch_size(32,  n), 32, 6,   6,   6   },
        { pick_batch_size(256, n), 1,  32,  32,  32  }, // 32x32x32 based on VoxNet arch
        { pick_batch_size(256, n), 32, 14,  14,  14  },
        { pick_batch_size(256, n), 32, 12,  12,  12  },
        { pick_batch_size(256, n), 32, 6,   6,   6   },
        { pick_batch_size(512, n), 1,  32,  32,  32  }, // 32x32x32 based on VoxNet arch
        { pick_batch_size(512, n), 32, 14,  14,  14  },
        { pick_batch_size(512, n), 32, 12,  12,  12  },
        { pick_batch_size(512, n), 32, 6,   6,   6   },
        { pick_batch_size(32,  n), 2,  32,  57,  125 }, // Hand-gesture recognition CVPR 2015 paper High Res Net Path
        { pick_batch_size(32,  n), 32, 14,  25,  59  },
        { pick_batch_size(32,  n), 32, 6,   10,  27  },
        { pick_batch_size(32,  n), 32, 4,   6,   11  },
        { pick_batch_size(32,  n), 32, 2,   2,   3   },
        { pick_batch_size(32,  n), 32, 32,  28,  62  }, // Hand-gesture recognition CVPR 2015 paper Low Res Net Path
        { pick_batch_size(32,  n), 32, 14,  12,  29  },
        { pick_batch_size(32,  n), 32, 6,   4,   12  },
        { pick_batch_size(32,  n), 32, 4,   2,   2   },
        { pick_batch_size(16,  n), 32, 6,   50,  50  }, // Multi-view 3D convnet
        { pick_batch_size(1,   n), 3,  8,   240, 320 }, // 3D convet on video
        { pick_batch_size(1,   n), 3,  16,  240, 320 }, // 3D convet on video
        { pick_batch_size(1,   n), 3,  8,   128, 171 }, // 3D convet on video
        { pick_batch_size(1,   n), 3,  16,  128, 171 }, // 3D convet on video
        { pick_batch_size(1,   n), 3,  8,   112, 112 }, // 3D convet on video
        { pick_batch_size(1,   n), 3,  16,  112, 112 }  // 3D convet on video
    };
    // clang-format on
}

// 5-D input shapes for spatial batch-norm over volumes (same configurations
// as the per-activation list above).
template <class T>
inline std::set<std::vector<T>>
get_3d_bn_spatial_inputs(T n = MIOPEN_TEST_DEFAULT_BATCH_SIZE_FACTOR)
{
    // clang-format off
    return
    {
        { pick_batch_size(32,  n), 1,  32,  32,  32  }, // 32x32x32 based on VoxNet arch
        { pick_batch_size(32,  n), 1,  14,  14,  14  },
        { pick_batch_size(32,  n), 32, 14,  14,  14  },
        { pick_batch_size(32,  n), 32, 12,  12,  12  },
        { pick_batch_size(32,  n), 32, 6,   6,   6   },
        { pick_batch_size(256, n), 1,  32,  32,  32  }, // 32x32x32 based on VoxNet arch
        { pick_batch_size(256, n), 32, 14,  14,  14  },
        { pick_batch_size(256, n), 32, 12,  12,  12  },
        { pick_batch_size(256, n), 32, 6,   6,   6   },
        { pick_batch_size(512, n), 1,  32,  32,  32  }, // 32x32x32 based on VoxNet arch
        { pick_batch_size(512, n), 32, 14,  14,  14  },
        { pick_batch_size(512, n), 32, 12,  12,  12  },
        { pick_batch_size(512, n), 32, 6,   6,   6   },
        { pick_batch_size(32,  n), 2,  32,  57,  125 }, // Hand-gesture recognition CVPR 2015 paper High Res Net Path
        { pick_batch_size(32,  n), 32, 14,  25,  59  },
        { pick_batch_size(32,  n), 32, 6,   10,  27  },
        { pick_batch_size(32,  n), 32, 4,   6,   11  },
        { pick_batch_size(32,  n), 32, 2,   2,   3   },
        { pick_batch_size(32,  n), 32, 32,  28,  62  }, // Hand-gesture recognition CVPR 2015 paper Low Res Net Path
        { pick_batch_size(32,  n), 32, 14,  12,  29  },
        { pick_batch_size(32,  n), 32, 6,   4,   12  },
        { pick_batch_size(32,  n), 32, 4,   2,   2   },
        { pick_batch_size(16,  n), 32, 6,   50,  50  }, // Multi-view 3D convnet
        { pick_batch_size(1,   n), 3,  8,   240, 320 }, // 3D convet on video
        { pick_batch_size(1,   n), 3,  16,  240, 320 }, // 3D convet on video
        { pick_batch_size(1,   n), 3,  8,   128, 171 }, // 3D convet on video
        { pick_batch_size(1,   n), 3,  16,  128, 171 }, // 3D convet on video
        { pick_batch_size(1,   n), 3,  8,   112, 112 }, // 3D convet on video
        { pick_batch_size(1,   n), 3,  16,  112, 112 }  // 3D convet on video
    };
    // clang-format on
}

// Mixed-rank sub-tensor length lists (rank 1 through 5) for tensor-op tests.
template <class T>
inline std::vector<std::vector<T>> get_sub_tensor()
{
    return {{16, 4, 8, 1, 4},
            {2, 4, 8, 8, 4},
            {16, 4, 8, 4},
            {13, 8, 4, 8},
            {3, 8, 7},
            {16, 4, 10},
            {3, 8},
            {16, 4},
            {4}};
}

// Pairs of (src, dst) element offsets for tensor-copy tests.
template <class T>
inline std::vector<std::vector<T>> get_tensor_offsets()
{
    static_assert(std::is_signed_v<T>);
    return {{0, 0}, {0, 2}, {4, 0}, {5, 7}};
}

// Single element offsets for tensor-op tests.
template <class T>
inline std::vector<T> get_tensor_offset()
{
    static_assert(std::is_signed_v<T>);
    return {0, 1, 2, 3, 4, 5};
}

#endif
// ---- patch boundary (preserved from the original diff) ----
// diff --git a/projects/miopen/miopen_utils/include/miopen_utils/random.hpp b/projects/miopen/miopen_utils/include/miopen_utils/random.hpp
// new file mode 100644
// index 000000000000..63b69ac9875a
// --- /dev/null
// +++ b/projects/miopen/miopen_utils/include/miopen_utils/random.hpp
// @@ -0,0 +1,62 @@
// MIT License, Copyright (c) 2021 Advanced Micro Devices, Inc.
// (license text continues on the following original lines)
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + *******************************************************************************/ +#ifndef GUARD_MIOPEN_TEST_RANDOM_HPP +#define GUARD_MIOPEN_TEST_RANDOM_HPP + +#include + +namespace prng { +template +inline T gen_descreet_uniform_sign(double scale, int32_t range) +{ + return static_cast(scale * prng::gen_A_to_B(-range + 1, range)); +} + +template +inline T gen_descreet_unsigned(double scale, int32_t range) +{ + return static_cast(scale * static_cast(gen_0_to_B(range))); +} + +} // namespace prng + +// lambda factory +template +auto uniform_signed_initializer(ScaleT scale_arg, RangeT range_arg) +{ + return [=](auto&&...) -> T { + // uniform sign give balance of both negative and positive values + return prng::gen_descreet_uniform_sign(scale_arg, range_arg); + }; +} + +template +auto uniform_unsigned_initializer(ScaleT scale_arg, RangeT range_arg) +{ + return [=](auto&&...) -> T { return prng::gen_descreet_unsigned(scale_arg, range_arg); }; +} + +#endif // GUARD_MIOPEN_TEST_RANDOM_HPP diff --git a/projects/miopen/miopen_utils/include/miopen_utils/rnn_util.hpp b/projects/miopen/miopen_utils/include/miopen_utils/rnn_util.hpp new file mode 100644 index 000000000000..a6569cebb7e6 --- /dev/null +++ b/projects/miopen/miopen_utils/include/miopen_utils/rnn_util.hpp @@ -0,0 +1,305 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2023 Advanced Micro Devices, Inc. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + *******************************************************************************/ + +#ifndef MIOPEN_RNN_UTIL_H_ +#define MIOPEN_RNN_UTIL_H_ + +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include + +// complexity O(NlogN) +inline std::vector GetReverseOrderIndex(const std::vector& base_index) +{ + std::vector reverse_index(base_index.size()); + unsigned next_rev_index = 0; + for(auto id : base_index) + reverse_index[id] = next_rev_index++; + return reverse_index; +}; + +inline std::vector GetSamplesIndexDescendingOrder(const std::vector& unsorted_seq_lens) +{ + const auto sample_count = unsorted_seq_lens.size(); + + std::vector index_v(sample_count); + std::iota(index_v.begin(), index_v.end(), 0); + + auto seq_len_cmp = [&unsorted_seq_lens](unsigned a_id, unsigned b_id) { + return unsorted_seq_lens[a_id] > unsorted_seq_lens[b_id]; + }; + + std::stable_sort(index_v.begin(), index_v.end(), seq_len_cmp); + + return index_v; +} + +template +inline void HiddenTensorReorder(const std::vector& src_array, + std::vector& dst_array, + const std::vector& batch_order, + const std::vector hid_len, + bool is_dst_direct_order) +{ + const size_t copy_size = hid_len[2]; + + const size_t batch_stride = hid_len[2]; + const size_t layer_stride = batch_stride * hid_len[1]; + + for(size_t batch_id = 0; batch_id < hid_len[1]; batch_id++) + { + const auto src_batch_off = + batch_stride * (is_dst_direct_order ? batch_order[batch_id] : batch_id); + const auto dst_batch_off = + batch_stride * (is_dst_direct_order ? 
batch_id : batch_order[batch_id]); + + for(size_t layer_id = 0; layer_id < hid_len[0]; layer_id++) + { + const auto dst_offset = dst_batch_off + layer_id * layer_stride; + const auto src_offset = src_batch_off + layer_id * layer_stride; + + std::copy(src_array.begin() + src_offset, + src_array.begin() + src_offset + copy_size, + dst_array.begin() + dst_offset); + } + } +} + +inline void createTensorDescArray(std::vector& td, + std::vector& ptd, + const std::vector bs, + const int secondDim, + miopenDataType_t dataType) +{ + + std::transform(bs.begin(), bs.end(), std::back_inserter(td), [&](int x) { + return miopen::TensorDescriptor( + dataType, {static_cast(x), static_cast(secondDim)}); + }); + std::transform(td.begin(), td.end(), std::back_inserter(ptd), [](miopen::TensorDescriptor& x) { + return &x; + }); +} + +inline std::tuple +GetTempPackedBuffersSize(std::vector batchs, int in_vec, int out_vec) +{ + size_t total_batch = std::accumulate(batchs.begin(), batchs.end(), 0ULL); + + size_t in_buff_size = total_batch * in_vec; + size_t out_buff_size = total_batch * out_vec; + return {in_buff_size, out_buff_size}; +} + +inline size_t getSuperTensorSize(const std::vector& bs, + int seqLength, + int inputSize, + int hiddenSize, + int maxPaddingVal, + bool isBidirect, + bool isInput, + bool isPadded) +{ + return (isPadded // + ? static_cast(seqLength) * maxPaddingVal + : std::accumulate(bs.begin(), bs.end(), 0ULL)) // + * (isInput // + ? static_cast(inputSize) + : static_cast(hiddenSize) * (isBidirect ? 
2 : 1)); +} + +template +void ChangeDataPadding(const std::vector& src_array, + std::vector& dst_array, + const std::vector& batch_list, + int max_batch, + int sample_size, + bool is_src_packed) +{ + auto seq_len = batch_list.size(); + + auto scr_ptr = &src_array[0]; + auto dst_ptr = &dst_array[0]; + + for(int seq_id = 0; seq_id < seq_len; seq_id++) + { + auto packed_size = batch_list[seq_id] * sample_size; + + std::copy(scr_ptr, scr_ptr + packed_size, dst_ptr); + + if(is_src_packed) + { + dst_ptr += max_batch * sample_size; + scr_ptr += packed_size; + } + else + { + scr_ptr += max_batch * sample_size; + dst_ptr += packed_size; + } + } +} + +// RNN VANILLA configs +inline std::vector get_rnn_num_layers() { return {{1, 3}}; } + +inline std::vector get_rnn_batchSize() { return {{1, 17}}; } + +inline std::vector get_rnn_seq_len() { return {{1, 3, 51}}; } + +inline std::vector get_rnn_vector_len() { return {31}; } + +inline std::vector get_rnn_hidden_size() { return {127}; } + +// LSTM configs +inline std::vector get_lstm_num_layers() { return {{1, 3}}; } + +inline std::vector get_lstm_batchSize() { return {{1, 17}}; } + +inline std::vector get_lstm_seq_len() { return {{1, 25}}; } + +inline std::vector get_lstm_vector_len() { return {17}; } + +inline std::vector get_lstm_hidden_size() { return {67}; } + +// GRU configs +inline std::vector get_gru_num_layers() { return {{1, 3}}; } + +inline std::vector get_gru_batchSize() { return {{1, 17}}; } + +inline std::vector get_gru_seq_len() { return {{1, 23}}; } + +inline std::vector get_gru_vector_len() { return {13}; } + +inline std::vector get_gru_hidden_size() { return {67}; } + +inline std::vector> generate_batchSeq(const int batchSize, const int seqLength) +{ + + static constexpr int modval = 3; + + int currentval = batchSize; + std::vector batchSeq; + batchSeq.reserve(seqLength); + for(int i = 0; i < seqLength; i++) + { + if(i > 0) + { + int nvalue = currentval - prng::gen_0_to_B(modval); + currentval = (nvalue < 1) ? 
1 : nvalue; + // printf("current value: %d\n", currentval); + } + // printf("adding a value to batch sequence: %d\n", currentval); + batchSeq.push_back(currentval); + } + return {batchSeq}; +} + +inline int sumvc(const std::vector& x) { return std::accumulate(x.begin(), x.end(), 0); } + +template +inline T activfunc(T x, int actvf) +{ + T alpha = static_cast(1), beta0 = static_cast(0), beta1 = static_cast(1); + if(actvf == 0) + { + return (x > 0) ? x : x * beta0; + } + else if(actvf == 2) + { + return static_cast(1 / (1 + std::exp(-x))); + } + return static_cast(alpha * std::tanh(beta1 * x)); +} + +template +inline T dervactivfunc(T x, int actvf) +{ + if(actvf == 0) + { + return static_cast(x > 0 ? 1 : 0); + } + else if(actvf == 2) + { + return static_cast(std::exp(-x) / (1 + std::exp(-x)) / (1 + std::exp(-x))); + } + + return static_cast(1 / std::cosh(x) / std::cosh(x)); +} + +template +void RNN_mm_cpu_batched(const Dtype* a_ptr, + size_t a_cols, + size_t a_rows, + size_t lda, + size_t a_stride, + int a_flags, + const Dtype* b_ptr, + size_t b_cols, + size_t b_rows, + size_t ldb, + size_t b_stride, + int b_flags, + Dtype* c_ptr, + size_t c_cols, + size_t c_rows, + size_t ldc, + size_t c_stride, + int batchCount, + double alpha, + double beta) +{ + for(int i = 0; i < batchCount; ++i) + { + gemm_cpu(a_ptr + a_stride * i, + a_cols, + a_rows, + lda, + a_flags == 1 ? true : false, + b_ptr + b_stride * i, + b_cols, + b_rows, + ldb, + b_flags == 1 ? 
true : false, + c_ptr + c_stride * i, + c_cols, + c_rows, + ldc, + alpha, + beta); + } +} + +#endif diff --git a/projects/miopen/miopen_utils/include/miopen_utils/serialize.hpp b/projects/miopen/miopen_utils/include/miopen_utils/serialize.hpp new file mode 100644 index 000000000000..71d3133df063 --- /dev/null +++ b/projects/miopen/miopen_utils/include/miopen_utils/serialize.hpp @@ -0,0 +1,129 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2018 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + *******************************************************************************/ + +#ifndef MIOPEN_GUARD_TEST_SERIALIZE_HPP +#define MIOPEN_GUARD_TEST_SERIALIZE_HPP + +#include +#include +#include +#include +#include +#include +#include + +template +struct is_trivial_serializable : std::is_trivially_copy_constructible +{ +}; + +template <> +struct is_trivial_serializable : std::true_type +{ +}; + +template +std::enable_if_t{}> serialize(std::ostream& os, const T& x) +{ + os.write(reinterpret_cast(&x), sizeof(T)); +} + +template +auto serialize(std::ostream& os, + const T& x) -> decltype(x.begin(), x.end(), T(x.begin(), x.end()), void()) +{ + std::size_t n = std::distance(x.begin(), x.end()); + serialize(os, n); + for(auto&& y : x) + serialize(os, y); +} + +template +std::enable_if_t>{}> +serialize(std::ostream& os, const std::tuple& t) +{ + miopen::unpack( + [&](auto&&... xs) { miopen::each_args([&](auto&& x) { serialize(os, x); }, xs...); }, t); +} + +template +std::enable_if_t{}> serialize(std::istream& is, T& x) +{ + is.read(reinterpret_cast(&x), sizeof(T)); +} + +template +std::enable_if_t{}> serialize(std::istream& is, std::vector& x) +{ + std::size_t n; + serialize(is, n); + x.resize(n); + is.read(reinterpret_cast(x.data()), sizeof(T) * n); +} + +template +auto serialize(std::istream& is, + T& x) -> decltype(x.begin(), x.end(), x.assign(x.begin(), x.end()), void()) +{ + using value_type = std::decay_t; + std::size_t n; + serialize(is, n); + std::vector v; + v.reserve(n); + for(std::size_t i = 0; i < n; i++) + { + value_type y; + serialize(is, y); + v.push_back(y); + } + x.assign(v.begin(), v.end()); +} + +template +std::enable_if_t>{}> +serialize(std::istream& is, + // cppcheck-suppress constParameter + std::tuple& t) +{ + miopen::unpack( + [&](auto&&... 
xs) { miopen::each_args([&](auto&& x) { serialize(is, x); }, xs...); }, t); +} + +template +void load(std::string name, T& x) +{ + std::ifstream is{name.c_str()}; + serialize(is, x); +} + +template +void save(std::string name, const T& x) +{ + std::ofstream os{name.c_str()}; + serialize(os, x); +} + +#endif diff --git a/projects/miopen/miopen_utils/include/miopen_utils/tensor_holder.hpp b/projects/miopen/miopen_utils/include/miopen_utils/tensor_holder.hpp new file mode 100644 index 000000000000..f762f80f280c --- /dev/null +++ b/projects/miopen/miopen_utils/include/miopen_utils/tensor_holder.hpp @@ -0,0 +1,505 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2017 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + *******************************************************************************/ +#ifndef GUARD_TENSOR_HOLDER_HPP +#define GUARD_TENSOR_HOLDER_HPP + +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include +using half = half_float::half; +using hip_bfloat16 = bfloat16; +#include "../../src/kernels/hip_float8.hpp" +using float8_fnuz = miopen_f8::hip_f8; +using bfloat8_fnuz = miopen_f8::hip_f8; + +#include +#include + +template +void visit_tensor_size(std::size_t n, F f) +{ + switch(n) + { + case 0: { + f(std::integral_constant{}); + break; + } + case 1: { + f(std::integral_constant{}); + break; + } + case 2: { + f(std::integral_constant{}); + break; + } + case 3: { + f(std::integral_constant{}); + break; + } + case 4: { + f(std::integral_constant{}); + break; + } + case 5: { + f(std::integral_constant{}); + break; + } + default: throw std::runtime_error("Unknown tensor size"); + } +} + +template +struct miopen_type; + +template <> +struct miopen_type : std::integral_constant +{ +}; + +template <> +struct miopen_type : std::integral_constant +{ +}; + +template <> +struct miopen_type : std::integral_constant +{ +}; +template <> +struct miopen_type : std::integral_constant +{ +}; + +template <> +struct miopen_type : std::integral_constant +{ +}; + +template <> +struct miopen_type : std::integral_constant +{ +}; + +template <> +struct miopen_type : std::integral_constant +{ +}; + +template <> +struct miopen_type : std::integral_constant +{ +}; + +template <> +struct miopen_type : std::integral_constant +{ +}; + +template <> +struct miopen_type : std::integral_constant +{ +}; + +template <> +struct miopen_type : std::integral_constant +{ +}; + +template <> +struct miopen_type : std::integral_constant +{ +}; + +template +struct tensor +{ + using value_type = T; + miopen::TensorDescriptor desc; + std::vector data; + +#if defined(__clang__) || defined(__GNUG__) +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored 
"-Wdeprecated-declarations" +#endif + + tensor() : desc(miopen_type{}) {} + +#if defined(__clang__) || defined(__GNUG__) +#pragma GCC diagnostic pop +#endif + + template + tensor(const std::vector& dims) : desc(miopen_type{}, dims), data(desc.GetElementSpace()) + { + } + + template + tensor(const std::vector& dims, const std::vector& strides) + : desc(miopen_type{}, dims, strides), data(desc.GetElementSpace()) + { + assert(dims.size() == strides.size()); + } + + template + tensor(miopenTensorLayout_t layout, const std::vector& dims) + : desc(miopen_type{}, layout, dims), data(desc.GetElementSpace()) + { + } + + template + tensor(miopenTensorLayout_t layout, const std::vector& dims, const std::vector& strides) + : desc(miopen_type{}, layout, dims, strides), data(desc.GetElementSpace()) + { + assert(dims.size() == strides.size()); + } + + tensor(std::size_t n, std::size_t c, std::size_t h, std::size_t w) + : desc(miopen_type{}, {n, c, h, w}), data(n * c * h * w) + { + } + + tensor(miopenTensorLayout_t layout, std::size_t n, std::size_t c, std::size_t h, std::size_t w) + : desc(miopen_type{}, layout, {n, c, h, w}), data(desc.GetElementSpace()) + { + } + + tensor(std::size_t n, std::size_t c, std::size_t d, std::size_t h, std::size_t w) + : desc(miopen_type{}, {n, c, d, h, w}), data(n * c * d * h * w) + { + } + + tensor(std::size_t n) : desc(miopen_type{}, {n}), data(n) {} + + tensor(miopen::TensorDescriptor rhs) : desc(std::move(rhs)) + { + assert(desc.GetType() == miopen_type{} + /// In the driver, T is input tensor type, but output tensor holders + /// are instantiatied with T as well. This leads to false assertion + /// failures when T is INT8 because output type is different. 
+ /// \todo Get rid of this hack when the driver is improved: + || (miopen_type{} == miopenInt8 && desc.GetType() == miopenInt32)); + data.resize(desc.GetElementSpace()); + } + + size_t GetDataByteSize() const { return GetSize() * sizeof(T); } + + size_t GetSize() const { return desc.GetElementSpace(); } + + template + tensor& generate(G g) & + { + if(this->desc.GetVectorLength() > 1) + this->generate_vect_impl(g); + else + this->generate_impl(g); + return *this; + } + + template + tensor&& generate(G g) && + { + if(this->desc.GetVectorLength() > 1) + this->generate_vect_impl(g); + else + this->generate_impl(g); + return std::move(*this); + } + + template + void generate_impl(G g) + { + auto seed = std::accumulate(desc.GetLengths().begin(), + desc.GetLengths().end(), + std::size_t{521288629}, + [](auto x, auto y) { + x ^= x << 1U; + return x ^ y; + }); + seed ^= data.size(); + seed ^= desc.GetLengths().size(); + prng::reset_seed(seed); + auto iterator = data.begin(); + auto assign = [&](T x) { + *iterator = x; + ++iterator; + }; + this->for_each( + miopen::compose(miopen::compose(assign, miopen::cast_to()), std::move(g))); + } + + template + void generate_vect_impl(G g) + { + auto seed = std::accumulate(desc.GetLengths().begin(), + desc.GetLengths().end(), + std::size_t{521288629}, + [](auto x, auto y) { + x ^= x << 1U; + return x ^ y; + }); + seed ^= data.size(); + seed ^= desc.GetLengths().size(); + prng::reset_seed(seed); + auto iterator = data.begin(); + auto vectorLength = desc.GetVectorLength(); + auto assign = [&](T x) { + assert(iterator < data.end()); + // for debugging + for(auto i = 0; i < vectorLength; i++) + { + *(iterator + i) = x; + } + iterator += vectorLength; + }; + this->for_each( + miopen::compose(miopen::compose(assign, miopen::cast_to()), std::move(g))); + } + + template + struct for_each_unpacked + { + Loop loop; + F f; + template + auto operator()(Ts... 
xs) const -> decltype(f(xs...), void()) + { + loop(xs...)(std::move(f)); + } + + struct any + { + any() {} + template + any(X) + { + } + }; + + [[noreturn]] void operator()(any = {}, + any = {}, + any = {}, + any = {}, + any = {}, + any = {}, + any = {}, + any = {}, + any = {}) const + { + throw std::runtime_error( + "Arguments to for_each do not match tensor size or the function " + + miopen::get_type_name() + " can not be called."); + } + }; + + struct for_each_handler + { + template + void operator()(Self* self, Loop loop, F f, Size size) const + { + auto dims = miopen::tien(self->desc.GetLengths()); + miopen::unpack(for_each_unpacked{loop, std::move(f)}, dims); + } + }; + + template + void for_each(F f) const + { + visit_tensor_size( + desc.GetLengths().size(), + std::bind(for_each_handler{}, this, miopen::ford, std::move(f), std::placeholders::_1)); + } + + template + void par_for_each(F f) const + { + visit_tensor_size( + desc.GetLengths().size(), + std::bind( + for_each_handler{}, this, miopen::par_ford, std::move(f), std::placeholders::_1)); + } + + template + T& operator()(Ts... xs) + { + assert(this->desc.GetIndex(xs...) < data.size()); + return this->data[this->desc.GetIndex(xs...)]; + } + + template + const T& operator()(Ts... xs) const + { + assert(this->desc.GetIndex(xs...) < data.size()); + return this->data[this->desc.GetIndex(xs...)]; + } + + template + const T& operator()(const std::array& multi_id) const + { + auto f = [&](auto... 
is) { return this->desc.GetIndex(is...); }; + assert(miopen::unpack(f, multi_id) < data.size()); + return this->data[miopen::unpack(f, multi_id)]; + } + + T& operator[](std::size_t i) { return data.at(i); } + + const T& operator[](std::size_t i) const { return data.at(i); } + + typename std::vector::iterator begin() { return data.begin(); } + + typename std::vector::iterator end() { return data.end(); } + + typename std::vector::const_iterator begin() const { return data.begin(); } + + typename std::vector::const_iterator end() const { return data.end(); } + + friend std::ostream& operator<<(std::ostream& stream, const tensor& t) + { + return stream << t.desc; + } + + template + void dump_inner(size_t dim, std::array& coord, Stream& stream) const + { + const auto lengths = this->desc.GetLengths(); + if(lengths.size() == 0) + { + // 0D special case: Just print the one value that we have and return. + stream << (*this)(coord); + } + else if(dim + 1 == lengths.size()) + { + // 1D special case: dump everything on one line + for(size_t i = 0; i < lengths[dim]; ++i) + { + if(i != 0) + stream << ' '; + + coord[dim] = i; + stream << std::setw(4) << (*this)(coord); + } + + stream << '\n'; + } + else + { + if(dim + 2 == lengths.size()) + { + // 2D special case: Also print which 2D slice we are currently printing + // Note: this is not needed for higher dimensions, as they will also pass + // through this branch. 
+ stream << "slice ["; + for(size_t i = 0; i < dim; ++i) + { + stream << coord[i] << ", "; + } + stream << ":, :]\n"; + } + + for(size_t i = 0; i < lengths[dim]; ++i) + { + coord[dim] = i; + this->dump_inner(dim + 1, coord, stream); + } + } + } + + template + void dump(const char* name, Stream& stream = std::cout) const + { + const auto n = this->desc.GetLengths().size(); + stream << "==== " << name << ": " << *this << n << '\n'; + stream.fill(' '); + + const auto flags = stream.flags(); + + visit_tensor_size(n, [&](const auto size) { + constexpr size_t N = decltype(size)::value; + std::array coord; + this->dump_inner(0, coord, stream); + }); + + stream.flags(flags); + } +}; + +template +void serialize(std::istream& s, tensor& x) +{ + std::vector lens; + serialize(s, lens); + std::vector strides; + serialize(s, strides); + x.desc = miopen::TensorDescriptor{miopen_type{}, lens, strides}; + serialize(s, x.data); +} + +template +void serialize(std::ostream& s, const tensor& x) +{ + const auto& lens = x.desc.GetLengths(); + const auto& strides = x.desc.GetStrides(); + serialize(s, lens); + serialize(s, strides); + serialize(s, x.data); +} + +struct tensor_generate +{ + template + Tensor&& operator()(Tensor&& t, G g) const + { + return std::forward(t.generate(g)); + } +}; + +struct tensor_elem_gen_integer +{ + uint64_t max_value = 17; + + template + double operator()(Ts... 
Xs) const + { + static_assert(sizeof...(Ts) < 6, + "Dimensions in tensor_elem_gen_integer must be less than 6."); + assert(max_value > 0); + std::array left = {{Xs...}}; + std::array right = {{613, 547, 701, 877, 1049}}; + uint64_t dot = + std::inner_product(left.begin(), left.end(), right.begin(), static_cast(173)); + return static_cast(dot % max_value); + } +}; + +#endif diff --git a/projects/miopen/miopen_utils/include/miopen_utils/verify.hpp b/projects/miopen/miopen_utils/include/miopen_utils/verify.hpp new file mode 100644 index 000000000000..81af2afbcf2d --- /dev/null +++ b/projects/miopen/miopen_utils/include/miopen_utils/verify.hpp @@ -0,0 +1,245 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2017 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + *******************************************************************************/ +#ifndef GUARD_VERIFY_HPP +#define GUARD_VERIFY_HPP + +#include +#include +#include +#include +#include +#include +#include +#include +using half = half_float::half; +using hip_bfloat16 = bfloat16; +#include +#include + +namespace miopen { + +// Compute the value of a range +template +using range_value = typename std::decay().begin())>::type; + +struct sum_fn +{ + template + auto operator()(T x, U y) const MIOPEN_RETURNS(x + y); +}; +static constexpr sum_fn sum{}; + +struct max_fn +{ + template + static T id(T x) + { + return x; + } + + template + auto operator()(T x, U y) const MIOPEN_RETURNS(max_fn::id(x > y ? x : y)); +}; +static constexpr max_fn max{}; + +namespace abs_diff_detail { +using std::fabs; +struct fn +{ + template + auto operator()(T x, U y) const MIOPEN_RETURNS(fabs(x - y)); +}; + +} // namespace abs_diff_detail + +static constexpr abs_diff_detail::fn abs_diff{}; + +struct not_finite_fn +{ + template ), bool>::type = false> + bool operator()(T x) const + { + return !std::isfinite(x); + } + + template ::type, half_float::half>), + bool>::type = false> + bool operator()(T x) const + { + return !half_float::isfinite(x); + } + + template ::type, bfloat16>), + bool>::type = false> + bool operator()(T x) const + { + return !std::isfinite(x); // bfloat16 has float() conversion operator + } + + template ), bool>::type = false> + bool operator()(T x) const + { + std::ignore = x; + return false; + } +}; +static constexpr not_finite_fn not_finite{}; + +template +T as(T, U x) +{ + return x; +} + +struct compare_mag_fn +{ + template + bool operator()(T x, U y) const + { + using std::fabs; + return fabs(x) < fabs(y); + } +}; +static constexpr compare_mag_fn compare_mag{}; + +struct square_diff_fn +{ + template + double operator()(T x, U y) const + { + double diff = static_cast(x - y); + return diff * diff; + } +}; +static constexpr square_diff_fn square_diff{}; + +template , 
bool> = true> +bool equal_values(T const& lhs, T const& rhs) +{ + return lhs == rhs; +} + +template , bool> = true> +bool equal_values(T const& lhs, T const& rhs) +{ + return miopen::float_equal_sentinel(lhs, rhs); +} + +template +bool range_empty(R1&& r1) +{ + return r1.begin() == r1.end(); +} + +template +auto range_distance(R1&& r1) MIOPEN_RETURNS(std::distance(r1.begin(), r1.end())); + +template +bool range_zero(const std::vector& r) +{ + return std::all_of(r.begin(), r.end(), [](T x) { return equal_values(x, T()); }); +} + +template +bool range_zero(const tensor& r) +{ + return range_zero(r.data); +} + +template +T range_product(R1&& r1, R2&& r2, T state, Reducer r, Product p) +{ + return std::inner_product(r1.begin(), r1.end(), r2.begin(), state, r, p); +} + +template +std::size_t mismatch_idx(R1&& r1, R2&& r2, Compare compare) +{ + auto p = std::mismatch(r1.begin(), r1.end(), r2.begin(), compare); + return std::distance(r1.begin(), p.first); +} + +template +int64_t find_idx(R1&& r1, Predicate p) +{ + auto it = std::find_if(r1.begin(), r1.end(), p); + if(it == r1.end()) + return -1; + else + return std::distance(r1.begin(), it); +} + +template +double max_diff(R1&& r1, R2&& r2) +{ + return range_product(r1, r2, 0.0, max, abs_diff); +} + +template +auto max_diff_v2(R1&& r1, R2&& r2) +{ + using T = decltype(r1[0] - r2[0]); + auto abs_diff_func = [](auto x, auto y) { return x > y ? 
x - y : y - x; }; + // BUG: deduced wrong datatype, half_float bug + if constexpr(std::is_same_v) + return range_product(r1, r2, half_float::half(), max, abs_diff_func); + else + return range_product(r1, r2, T(), max, abs_diff_func); +} + +template +std::size_t mismatch_diff(R1&& r1, R2&& r2, T diff) +{ + return mismatch_idx( + r1, + r2, + std::bind( + float_equal, diff, std::bind(abs_diff, std::placeholders::_1, std::placeholders::_2))); +} + +template +double rms_range(R1&& r1, R2&& r2) +{ + std::size_t n = range_distance(r1); + if(n == range_distance(r2)) + { + if(n == 0) + return 0; + double square_difference = range_product(r1, r2, 0.0, sum_fn{}, square_diff); + double mag1 = static_cast(*std::max_element(r1.begin(), r1.end(), compare_mag)); + double mag2 = static_cast(*std::max_element(r2.begin(), r2.end(), compare_mag)); + double mag = + std::max({std::fabs(mag1), std::fabs(mag2), std::numeric_limits::min()}); + return std::sqrt(square_difference) / (std::sqrt(n) * mag); + } + else + return double(std::numeric_limits>::max()); +} +} // namespace miopen +#endif diff --git a/projects/miopen/speedtests/CMakeLists.txt b/projects/miopen/speedtests/CMakeLists.txt index 9aa89974cc75..826da17b59db 100644 --- a/projects/miopen/speedtests/CMakeLists.txt +++ b/projects/miopen/speedtests/CMakeLists.txt @@ -16,7 +16,7 @@ function(add_speedtest_executable TEST_NAME) endif() separate_arguments(MIOPEN_TEST_FLAGS_ARGS NATIVE_COMMAND ${MIOPEN_TEST_FLAGS}) # MIOpen_with_plugins ensures CK plugin .so's are built alongside the speedtest - target_link_libraries(${TEST_NAME} MIOpen_with_plugins) + target_link_libraries(${TEST_NAME} MIOpen_with_plugins miopen_common_utils miopen_utils) target_include_directories(${TEST_NAME} PRIVATE ../test ../src/kernels) endfunction(add_speedtest_executable) diff --git a/projects/miopen/src/CMakeLists.txt b/projects/miopen/src/CMakeLists.txt index 9e6f401b7506..84bbd53716fb 100644 --- a/projects/miopen/src/CMakeLists.txt +++ 
b/projects/miopen/src/CMakeLists.txt @@ -8,14 +8,7 @@ if(MIOPEN_ENABLE_SQLITE) add_subdirectory(sqlite) endif() -# Truncation rounding or (default) rounding to nearest even (RNE) is enabled. -# This switch controls two related but different aspects of MIOpen behavior -# 1. How host code performs conversions of float to bfloat16, important only -# for testing. -# 2. How BF16 kernels (which are kind of mixed-precision now and expected to -# remain in the future) perform final conversion (and rounding) of FP32 -# to BF16 results. This affects the main functionality of the library. -option( MIOPEN_USE_RNE_BFLOAT16 "Sets rounding scheme for bfloat16 type" ON ) +# MIOPEN_USE_RNE_BFLOAT16 is declared in the top-level CMakeLists.txt. option( MIOPEN_FP8_IEEE_EXPONENT_BIAS "Sets the FP8 exponent bias to IEEE" OFF) option( MIOPEN_FP8_CLIPPING "Sets the FP8 clipping" ON) @@ -931,7 +924,7 @@ endif() target_include_directories(MIOpen SYSTEM PUBLIC $) # Workaround : change in rocm-cmake was causing linking error so had to add ${CMAKE_DL_LIBS} # We can remove ${CMAKE_DL_LIBS} once root cause is identified. -target_link_libraries(MIOpen PRIVATE ${CMAKE_DL_LIBS} Threads::Threads BZip2::BZip2) +target_link_libraries(MIOpen PRIVATE ${CMAKE_DL_LIBS} Threads::Threads BZip2::BZip2 miopen_common_utils) miopen_generate_export_header(MIOpen) if(WIN32) diff --git a/projects/miopen/src/ck_impl/CMakeLists.txt b/projects/miopen/src/ck_impl/CMakeLists.txt index ae380f174007..791250958533 100644 --- a/projects/miopen/src/ck_impl/CMakeLists.txt +++ b/projects/miopen/src/ck_impl/CMakeLists.txt @@ -145,7 +145,7 @@ foreach(gpu_target IN LISTS _CK_FILTERED_TARGETS) target_link_libraries(${lib_name} PRIVATE hip::device) # Link against MIOpen for shared types (ConvSolution, InvokerFactory, etc.) 
- target_link_libraries(${lib_name} PRIVATE MIOpen) + target_link_libraries(${lib_name} PRIVATE MIOpen miopen_common_utils) # Install alongside MIOpen install(TARGETS ${lib_name} diff --git a/projects/miopen/src/include/miopen/algorithm.hpp b/projects/miopen/src/include/miopen/algorithm.hpp index d1098a066077..38b87c1e38b4 100644 --- a/projects/miopen/src/include/miopen/algorithm.hpp +++ b/projects/miopen/src/include/miopen/algorithm.hpp @@ -1,47 +1,2 @@ -/******************************************************************************* - * - * MIT License - * - * Copyright (c) 2019 Advanced Micro Devices, Inc. - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- * - *******************************************************************************/ -#ifndef GUARD_MLOPEN_ALGORITHM_HPP -#define GUARD_MLOPEN_ALGORITHM_HPP - -#include - -namespace miopen { - -template -bool any_of(const Range& r, Predicate p) -{ - return std::any_of(r.begin(), r.end(), p); -} - -template -bool all_of(const Range& r, Predicate p) -{ - return std::all_of(r.begin(), r.end(), p); -} - -} // namespace miopen - -#endif +// Forwarding header — implementation moved to common_utils. +#include diff --git a/projects/miopen/src/include/miopen/bfloat16.hpp b/projects/miopen/src/include/miopen/bfloat16.hpp index 3e3a184a72d1..eab3c5b2c826 100644 --- a/projects/miopen/src/include/miopen/bfloat16.hpp +++ b/projects/miopen/src/include/miopen/bfloat16.hpp @@ -1,179 +1,2 @@ -// Copyright (c) Advanced Micro Devices, Inc., or its affiliates. -// SPDX-License-Identifier: MIT - -#ifndef BFLOAT16_H_ -#define BFLOAT16_H_ - -#include -#include - -class bfloat16 -{ -public: - bfloat16() : data_{0} {} - explicit bfloat16(float rhs) - { - union - { - float float_st; - std::uint32_t bf16_st; - } bits_st = {rhs}; - - // BF16 round and NaN preservation code matches - // https://github.com/ROCm/rocBLAS/blob/develop/library/include/rocblas_bfloat16.h - if((~bits_st.bf16_st & 0x7f800000) == 0) // Inf or NaN - { - // When all of the exponent bits are 1, the value is Inf or NaN. - // Inf is indicated by a zero mantissa. NaN is indicated by any nonzero - // mantissa bit. Quiet NaN is indicated by the most significant mantissa - // bit being 1. Signaling NaN is indicated by the most significant - // mantissa bit being 0 but some other bit(s) being 1. If any of the - // lower 16 bits of the mantissa are 1, we set the least significant bit - // of the bfloat16 mantissa, in order to preserve signaling NaN in case - // the bloat16's mantissa bits are all 0. 
- if((bits_st.bf16_st & 0xffff) != 0) - { - bits_st.bf16_st |= 0x10000; // Preserve signaling NaN - } - } - else - { -#if MIOPEN_USE_RNE_BFLOAT16 == 1 - // When the exponent bits are not all 1s, then the value is zero, normal, - // or subnormal. We round the bfloat16 mantissa up by adding 0x7FFF, plus - // 1 if the least significant bit of the bfloat16 mantissa is 1 (odd). - // This causes the bfloat16's mantissa to be incremented by 1 if the 16 - // least significant bits of the float mantissa are greater than 0x8000, - // or if they are equal to 0x8000 and the least significant bit of the - // bfloat16 mantissa is 1 (odd). This causes it to be rounded to even when - // the lower 16 bits are exactly 0x8000. If the bfloat16 mantissa already - // has the value 0x7f, then incrementing it causes it to become 0x00 and - // the exponent is incremented by one, which is the next higher FP value - // to the unrounded bfloat16 value. When the bfloat16 value is subnormal - // with an exponent of 0x00 and a mantissa of 0x7F, it may be rounded up - // to a normal value with an exponent of 0x01 and a mantissa of 0x00. - // When the bfloat16 value has an exponent of 0xFE and a mantissa of 0x7F, - // incrementing it causes it to become an exponent of 0xFF and a mantissa - // of 0x00, which is Inf, the next higher value to the unrounded value. 
- bits_st.bf16_st += - (0x7fff + ((bits_st.bf16_st >> 16) & 1)); // Round to nearest, round to even -#else // truncation -// do nothing -#endif - } - data_ = bits_st.bf16_st >> 16; - } - operator float() const - { - union - { - std::uint32_t bf16_st; - float float_st; - } bits_st = {data_}; - - bits_st.bf16_st = bits_st.bf16_st << 16; - return bits_st.float_st; - } - - bfloat16 operator-() const { return bfloat16(-static_cast(*this)); } - bfloat16 operator+() const { return *this; } - - bfloat16& operator=(const float rhs) - { - *this = bfloat16(rhs); - return *this; - } - bfloat16& operator+=(bfloat16 rhs) - { - *this = bfloat16(static_cast(*this) + static_cast(rhs)); - return *this; - } - - bfloat16& operator+=(float rhs) - { - *this = bfloat16(static_cast(*this) + rhs); - return *this; - } - - bfloat16& operator-=(bfloat16 rhs) - { - *this += -rhs; - return *this; - } - bfloat16& operator*=(bfloat16 rhs) - { - *this = bfloat16(static_cast(*this) * static_cast(rhs)); - return *this; - } - bfloat16& operator*=(float rhs) - { - *this = bfloat16(static_cast(*this) * rhs); - return *this; - } - - bfloat16& operator/=(bfloat16 rhs) - { - *this = bfloat16(static_cast(*this) / static_cast(rhs)); - return *this; - } - bool operator<(bfloat16 rhs) const - { - return static_cast(*this) < static_cast(rhs); - } - bool operator==(bfloat16 rhs) const { return std::equal_to()(*this, rhs); } - - static constexpr bfloat16 generate(uint16_t val) { return bfloat16{val, true}; } - -private: - constexpr bfloat16(std::uint16_t val, bool) : data_{val} {} - - std::uint16_t data_; -}; - -inline bfloat16 operator+(bfloat16 a, const bfloat16& b) -{ - a += b; - return a; -} - -inline bfloat16 operator-(bfloat16 a, const bfloat16& b) -{ - a -= b; - return a; -} - -inline bfloat16 operator*(bfloat16 a, const bfloat16& b) -{ - a *= b; - return a; -} - -inline bfloat16 operator/(bfloat16 a, const bfloat16& b) -{ - a /= b; - return a; -} - -namespace std { -template <> -class numeric_limits -{ 
-public: - static constexpr bool is_specialized = true; - static constexpr bfloat16 min() noexcept { return bfloat16::generate(0x0080); } // 0x1.00p-126 - static constexpr bfloat16 max() noexcept { return bfloat16::generate(0x7F7F); } - static constexpr bfloat16 lowest() noexcept { return bfloat16::generate(0xFF7F); } - static constexpr bfloat16 epsilon() noexcept { return bfloat16::generate(0x3C00); } - static constexpr bfloat16 infinity() noexcept { return bfloat16::generate(0x7F80); } - static constexpr bfloat16 quiet_NaN() noexcept { return bfloat16::generate(0x7FC0); } // qnan(0) - static constexpr bfloat16 signaling_NaN() noexcept - { - return bfloat16::generate(0x7F81); // snan(1) - } - static constexpr bfloat16 denorm_min() noexcept - { - return bfloat16::generate(0x0001); // 0x0.02p-126 - } -}; -} // namespace std -#endif +// Forwarding header — implementation moved to common_utils. +#include diff --git a/projects/miopen/src/include/miopen/each_args.hpp b/projects/miopen/src/include/miopen/each_args.hpp index e078153dc998..983c7da843dd 100644 --- a/projects/miopen/src/include/miopen/each_args.hpp +++ b/projects/miopen/src/include/miopen/each_args.hpp @@ -1,79 +1,2 @@ -/******************************************************************************* - * - * MIT License - * - * Copyright (c) 2017 Advanced Micro Devices, Inc. - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. 
- * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - * - *******************************************************************************/ -#ifndef GUARD_MIOPEN_EACH_ARGS_HPP -#define GUARD_MIOPEN_EACH_ARGS_HPP - -#include -#include -#include - -namespace miopen { -namespace detail { - -template -void each_args_i_impl(F f, std::index_sequence, Ts&&... xs) -{ - (void)std::initializer_list{ - (f(std::integral_constant{}, std::forward(xs)), 0)...}; -} - -template -auto unpack_impl(F f, std::index_sequence, T&& x) -{ - return f(std::get(x)...); -} - -} // namespace detail - -template -void each_args_i(F f, Ts&&... xs) -{ - detail::each_args_i_impl(f, std::make_index_sequence(), std::forward(xs)...); -} - -template -void each_args(F f, Ts&&... xs) -{ - (void)std::initializer_list{(f(std::forward(xs)), 0)...}; -} - -// Workaround for gcc warnings -template -void each_args(F) -{ -} - -template -auto unpack(F f, T&& x) -{ - using type = typename std::remove_cv::type>::type; - return detail::unpack_impl( - f, std::make_index_sequence::value>(), std::forward(x)); -} - -} // namespace miopen - -#endif +// Forwarding header — implementation moved to common_utils. 
+#include diff --git a/projects/miopen/src/include/miopen/float_equal.hpp b/projects/miopen/src/include/miopen/float_equal.hpp index 24bbdc55ad11..a48c2e417489 100644 --- a/projects/miopen/src/include/miopen/float_equal.hpp +++ b/projects/miopen/src/include/miopen/float_equal.hpp @@ -1,89 +1,2 @@ -/******************************************************************************* - * - * MIT License - * - * Copyright (c) 2017 Advanced Micro Devices, Inc. - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- * - *******************************************************************************/ -#ifndef GUARD_MLOPEN_FLOAT_EQUAL_HPP -#define GUARD_MLOPEN_FLOAT_EQUAL_HPP - -#include -#include -#include -#include - -namespace miopen { - -template -using common_type = typename std::common_type::type; - -struct float_equal_fn -{ - template - static bool apply(T x, T y) - { - // The standard library from MSVC does not implement std::isfinite() for integer - // types - no additional overloads are provided. According to the documentation, - // integer types should be treaded as doubles. - // Refer to https://en.cppreference.com/w/cpp/numeric/math/isfinite for more information. - return std::isfinite(static_cast(x)) and std::isfinite(static_cast(y)) and - std::nextafter(x, std::numeric_limits::lowest()) <= y and - std::nextafter(x, std::numeric_limits::max()) >= y; - } - - template - bool operator()(T x, U y) const - { - return float_equal_fn::apply>(x, y); - } -}; - -static constexpr float_equal_fn float_equal{}; - -/// Special case for comparing with a sentinel value -struct float_equal_sentinel_fn -{ - template - static bool apply(T x, T y) - { -// In this case we have to ignore this warning, because we intend to compare with the exact value -#pragma clang diagnostic push -#pragma clang diagnostic ignored "-Wfloat-equal" - bool equals_sentinel = x == y; -#pragma clang diagnostic pop - - return std::isfinite(static_cast(x)) and std::isfinite(static_cast(y)) and - equals_sentinel; - } - - template - bool operator()(T x, U y) const - { - return float_equal_sentinel_fn::apply>(x, y); - } -}; - -static constexpr float_equal_sentinel_fn float_equal_sentinel{}; - -} // namespace miopen - -#endif +// Forwarding header — implementation moved to common_utils. 
+#include diff --git a/projects/miopen/src/include/miopen/ford.hpp b/projects/miopen/src/include/miopen/ford.hpp index f56b20de4d46..beac57e1e6e8 100644 --- a/projects/miopen/src/include/miopen/ford.hpp +++ b/projects/miopen/src/include/miopen/ford.hpp @@ -1,122 +1,2 @@ -/******************************************************************************* - * - * MIT License - * - * Copyright (c) 2017 Advanced Micro Devices, Inc. - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- * - *******************************************************************************/ -#ifndef GUARD_FORD_HPP -#define GUARD_FORD_HPP - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include - -#include - -namespace miopen { - -// An improved async, that doesn't block -template -std::future::type> detach_async(Function&& f) -{ - using result_type = typename std::invoke_result::type; - std::packaged_task task(std::forward(f)); - auto fut = task.get_future(); - std::thread(std::move(task)).detach(); - return fut; -} - -template -auto then(std::future f, Work w) -> std::future -{ - return std::async(std::launch::deferred, - [=, f_ = std::move(f)]() mutable { return w(f_.get()); }); -} - -template -struct ford_wrapper -{ - template - auto operator()(Ts... xs) const MIOPEN_RETURNS(std::bind(T{}, std::placeholders::_1, xs...)); -}; - -// Multidimensional for loop -struct ford_impl -{ - template - void operator()(F f) const - { - f(); - } - - template - void operator()(F f, T x, Ts... xs) const - { - // Workaround for https://gcc.gnu.org/bugzilla/show_bug.cgi?id=55914 - for(T i = 0; i < x; i++) - { - (*this)([&](Ts... is) { f(i, is...); }, xs...); - } - } -}; - -static constexpr ford_wrapper ford{}; - -struct par_ford_impl -{ - template - void operator()(F f, Ts... 
xs) const - { - using array_type = std::array; - array_type lens = {{static_cast(xs)...}}; - array_type strides; - strides.fill(1); - std::partial_sum( - lens.rbegin(), lens.rend() - 1, strides.rbegin() + 1, std::multiplies()); - auto size = std::accumulate( - lens.begin(), lens.end(), static_cast(1), std::multiplies()); - par_for(size, [&](std::size_t i) { - array_type indices; - std::transform(strides.begin(), - strides.end(), - lens.begin(), - indices.begin(), - [&](size_t stride, size_t len) { return (i / stride) % len; }); - unpack(f, indices); - }); - } -}; - -static constexpr ford_wrapper par_ford{}; - -} // namespace miopen - -#endif +// Forwarding header — implementation moved to common_utils. +#include diff --git a/projects/miopen/src/include/miopen/functional.hpp b/projects/miopen/src/include/miopen/functional.hpp index 02c6e3427e87..d0a70ae6794d 100644 --- a/projects/miopen/src/include/miopen/functional.hpp +++ b/projects/miopen/src/include/miopen/functional.hpp @@ -1,131 +1,2 @@ -/******************************************************************************* - * - * MIT License - * - * Copyright (c) 2017 Advanced Micro Devices, Inc. - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - * - *******************************************************************************/ -#ifndef GUARD_MLOPEN_FUNCTIONAL_HPP -#define GUARD_MLOPEN_FUNCTIONAL_HPP - -#include -#include -#include - -namespace miopen { -namespace detail { - -template -auto each_i_impl(F f, std::index_sequence) - MIOPEN_RETURNS(f(std::integral_constant{}...)); -} // namespace detail - -template -struct by_t -{ - F f; - P p; - template - auto operator()(Ts&&... xs) const MIOPEN_RETURNS(f(p(std::forward(xs))...)) -}; - -template -by_t by(F f, P p) -{ - return {std::move(f), std::move(p)}; -} - -template -struct compose_t -{ - F f; - G g; - template - auto operator()(Ts&&... xs) const MIOPEN_RETURNS(f(g(std::forward(xs)...))) -}; - -template -compose_t compose(F f, G g) -{ - return {std::move(f), std::move(g)}; -} - -template -struct flip_t -{ - F f; - template - auto operator()(T&& x, U&& y) const MIOPEN_RETURNS(f(std::forward(y), std::forward(x))) -}; - -template -flip_t flip(F f) -{ - return {std::move(f)}; -} - -template -struct sequence_t -{ - F f; - template - auto operator()(IntegralConstant) const - MIOPEN_RETURNS(detail::each_i_impl(f, std::make_index_sequence())); -}; - -template -sequence_t sequence(F f) -{ - return {std::move(f)}; -} - -template -void repeat_n(F f, std::integral_constant) -{ - auto fs = [&f](auto... is) { return each_args(f, is...); }; - sequence(fs)(std::integral_constant{}); -} - -template -struct cast_to -{ - template - T operator()(X&& x) const - { - return static_cast(std::forward(x)); - } -}; - -template -auto unpacker(F f) -{ - return [=](auto xs) { return miopen::unpack(f, xs); }; -}; - -template -auto prepender(F f, Xs... xs) -{ - return [=](auto... 
ys) { return f(xs..., ys...); }; -} - -} // namespace miopen - -#endif +// Forwarding header — implementation moved to common_utils. +#include diff --git a/projects/miopen/src/include/miopen/par_for.hpp b/projects/miopen/src/include/miopen/par_for.hpp index 1272dcf6ac9b..4685b005db77 100644 --- a/projects/miopen/src/include/miopen/par_for.hpp +++ b/projects/miopen/src/include/miopen/par_for.hpp @@ -1,149 +1,2 @@ -/******************************************************************************* - * - * MIT License - * - * Copyright (c) 2020 Advanced Micro Devices, Inc. - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- * - *******************************************************************************/ - -#ifndef MIOPEN_GUARD_MLOPEN_PAR_FOR_HPP -#define MIOPEN_GUARD_MLOPEN_PAR_FOR_HPP - -#include -#include -#include -#include -#include -#include - -#include - -namespace miopen { - -struct joinable_thread : std::thread -{ - template - joinable_thread(Xs&&... xs) : std::thread(std::forward(xs)...) // NOLINT - { - } - - joinable_thread& operator=(joinable_thread&& other) = default; - joinable_thread(joinable_thread&& other) = default; - - ~joinable_thread() - { - if(this->joinable()) - this->join(); - } -}; - -struct thread_factory -{ - template - joinable_thread operator()(std::size_t& work, std::size_t n, std::size_t grainsize, F f) const - { - auto result = joinable_thread([=] { - std::size_t start = work; - std::size_t last = std::min(n, work + grainsize); - for(std::size_t i = start; i < last; i++) - { - f(i); - } - }); - work += grainsize; - return result; - } -}; - -template -void par_for_impl(std::size_t n, std::size_t threadsize, F f) -{ - if(threadsize <= 1) - { - for(std::size_t i = 0; i < n; i++) - f(i); - } - else - { - std::vector threads(threadsize); - const std::size_t grainsize = std::ceil(static_cast(n) / threads.size()); - - std::size_t work = 0; - std::generate(threads.begin(), - threads.end(), - std::bind(thread_factory{}, std::ref(work), n, grainsize, f)); - assert(work >= n); - } -} - -template -void par_for(std::size_t n, std::size_t min_grain, F f) -{ - const auto threadsize = - std::min(std::thread::hardware_concurrency(), n / min_grain); - par_for_impl(n, threadsize, f); -} - -struct min_grain -{ - std::size_t n = 0; -}; - -template -void par_for(std::size_t n, min_grain mg, F f) -{ - const auto threadsize = std::min(std::thread::hardware_concurrency(), n / mg.n); - par_for_impl(n, threadsize, f); -} - -template -void par_for(std::size_t n, F f) -{ - par_for(n, min_grain{8}, f); -} - -struct max_threads -{ - std::size_t n = 0; -}; - -template -void 
par_for(std::size_t n, max_threads mt, F f) -{ - const auto threadsize = std::min(std::thread::hardware_concurrency(), mt.n); - par_for_impl(n, std::min(threadsize, n), f); -} - -template -void par_for_strided(std::size_t n, max_threads mt, F f) -{ - auto threadsize = std::min(std::thread::hardware_concurrency(), mt.n); - par_for_impl(threadsize, threadsize, [&](auto start) { - for(std::size_t i = start; i < n; i += threadsize) - { - f(i); - } - }); -} - -} // namespace miopen - -#endif +// Forwarding header — implementation moved to common_utils. +#include diff --git a/projects/miopen/src/include/miopen/rank.hpp b/projects/miopen/src/include/miopen/rank.hpp index 013ec6e7f7f4..88a4541421d4 100644 --- a/projects/miopen/src/include/miopen/rank.hpp +++ b/projects/miopen/src/include/miopen/rank.hpp @@ -1,42 +1,2 @@ -/******************************************************************************* - * - * MIT License - * - * Copyright (c) 2017 Advanced Micro Devices, Inc. - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - * - *******************************************************************************/ -#ifndef GUARD_MIOPEN_RANK_HPP -#define GUARD_MIOPEN_RANK_HPP - -namespace miopen { - -template -struct rank : rank -{ -}; - -template <> -struct rank<0> -{ -}; -} // namespace miopen - -#endif +// Forwarding header — implementation moved to common_utils. +#include diff --git a/projects/miopen/src/include/miopen/reduce_common.hpp b/projects/miopen/src/include/miopen/reduce_common.hpp index 37b92e727d92..8d47ee0f05b0 100644 --- a/projects/miopen/src/include/miopen/reduce_common.hpp +++ b/projects/miopen/src/include/miopen/reduce_common.hpp @@ -1,66 +1,2 @@ -/******************************************************************************* - * - * MIT License - * - * Copyright (c) 2020 Advanced Micro Devices, Inc. - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - * - *******************************************************************************/ -#ifndef GUARD_MIOPEN_REDUCE_COMMON_HPP -#define GUARD_MIOPEN_REDUCE_COMMON_HPP - -#include -#include - -namespace reduce { - -template -static inline Tdst convert_type(Tsrc x) -{ - return static_cast(x); -} - -template <> -inline float convert_type(half_float::half x) -{ - return half_float::half_cast(x); -}; - -template <> -inline half_float::half convert_type(float x) -{ - return half_float::half_cast(x); -}; - -template <> -inline float convert_type(bfloat16 x) -{ - return float(x); -}; - -template <> -inline bfloat16 convert_type(float x) -{ - return bfloat16(x); -}; - -}; // end of namespace reduce - -#endif +// Forwarding header — implementation moved to common_utils. +#include diff --git a/projects/miopen/src/include/miopen/returns.hpp b/projects/miopen/src/include/miopen/returns.hpp index 4fdb1db18b87..8bd3067fdea3 100644 --- a/projects/miopen/src/include/miopen/returns.hpp +++ b/projects/miopen/src/include/miopen/returns.hpp @@ -1,38 +1,2 @@ -/******************************************************************************* - * - * MIT License - * - * Copyright (c) 2017 Advanced Micro Devices, Inc. 
- * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - * - *******************************************************************************/ - -#ifndef GUARD_MIOPEN_RETURNS_HPP -#define GUARD_MIOPEN_RETURNS_HPP - -#define MIOPEN_RETURNS(...) \ - ->decltype(__VA_ARGS__) { return __VA_ARGS__; } - -#define MIOPEN_BODY_RETURNS(...) \ - { \ - return __VA_ARGS__; \ - } - -#endif +// Forwarding header — implementation moved to common_utils. +#include diff --git a/projects/miopen/src/include/miopen/stringutils.hpp b/projects/miopen/src/include/miopen/stringutils.hpp index 5a412416d666..168eb6bee75e 100644 --- a/projects/miopen/src/include/miopen/stringutils.hpp +++ b/projects/miopen/src/include/miopen/stringutils.hpp @@ -1,165 +1,2 @@ -/******************************************************************************* - * - * MIT License - * - * Copyright (c) 2017 Advanced Micro Devices, Inc. 
- * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - * - *******************************************************************************/ -#ifndef GUARD_MIOPEN_STRINGUTILS_HPP -#define GUARD_MIOPEN_STRINGUTILS_HPP - -#include -#include -#include -#include -#include -#include -#include -#include - -#define MIOPEN_STRINGIZE_1(...) #__VA_ARGS__ -#define MIOPEN_STRINGIZE(...) 
MIOPEN_STRINGIZE_1(__VA_ARGS__) - -namespace miopen { - -inline std::string -ReplaceString(const std::string& in, const std::string& search, const std::string& replace) -{ - size_t pos = 0; - std::string subject(in); - while((pos = subject.find(search, pos)) != std::string::npos) - { - subject.replace(pos, search.length(), replace); - pos += replace.length(); - } - return subject; -} - -inline bool EndsWith(const std::string& value, const std::string& suffix) -{ - if(suffix.size() > value.size()) - return false; - else - return std::equal(suffix.rbegin(), suffix.rend(), value.rbegin()); -} - -template -inline std::string JoinStrings(Strings strings, std::string delim) -{ - auto it = strings.begin(); - if(it == strings.end()) - return ""; - - auto nit = std::next(it); - return std::accumulate( - nit, strings.end(), *it, [&](std::string x, std::string y) { return x + delim + y; }); -} - -template -static inline std::string TransformString(std::string s, F f) -{ - std::transform(s.begin(), s.end(), s.begin(), f); - return s; -} - -inline std::string ToUpper(std::string s) { return TransformString(std::move(s), ::toupper); } - -inline bool StartsWith(const std::string& value, const std::string& prefix) -{ - if(prefix.size() > value.size()) - return false; - else - return std::equal(prefix.begin(), prefix.end(), value.begin()); -} - -inline std::string RemovePrefix(std::string s, std::string prefix) -{ - if(StartsWith(s, prefix)) - return s.substr(prefix.length()); - else - return s; -} - -inline std::vector SplitSpaceSeparated(const std::string& in) -{ - std::istringstream ss(in); - const std::istream_iterator begin(ss), end; - return {begin, end}; -} - -inline std::vector SplitSpaceSeparated(const std::vector& in) -{ - std::vector rv; - for(const auto& item : in) - { - if(item.find(' ') != std::string::npos) - { - const auto splitted = SplitSpaceSeparated(item); - std::copy(splitted.begin(), splitted.end(), std::back_inserter(rv)); - } - else - { - 
rv.emplace_back(item); - } - } - return rv; -} - -inline std::vector SplitSpaceSeparated(const std::string& in, - const std::vector& dontSplitAfter) -{ - std::vector rv; - std::istringstream ss(in); - std::string s; - while(ss >> s) - { - if(any_of(dontSplitAfter, [&](const auto& dont) { return dont == s; })) - { - std::string s2; - if(ss >> s2) - { - s += std::string(" ").append(s2); // Exactly one space is important. - rv.push_back(s); - continue; - } - MIOPEN_THROW("Error parsing string: '" + in + '\''); - } - rv.push_back(s); - } - return rv; -} - -inline std::vector SplitDelim(const std::string& in, const char delim) -{ - std::vector rv; - std::string token; - std::istringstream ss(in); - - while(std::getline(ss, token, delim)) - { - rv.push_back(token); - } - return rv; -} - -} // namespace miopen - -#endif // GUARD_MIOPEN_STRINGUTILS_HPP +// Forwarding header — implementation moved to common_utils. +#include diff --git a/projects/miopen/src/include/miopen/type_name.hpp b/projects/miopen/src/include/miopen/type_name.hpp index ac7fd2ff6017..4f4afd78def0 100644 --- a/projects/miopen/src/include/miopen/type_name.hpp +++ b/projects/miopen/src/include/miopen/type_name.hpp @@ -1,139 +1,2 @@ -/******************************************************************************* - * - * MIT License - * - * Copyright (c) 2017-2024 Advanced Micro Devices, Inc. - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. 
- * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - * - *******************************************************************************/ - -#ifndef GUARD_TYPE_NAME_HPP -#define GUARD_TYPE_NAME_HPP - -#include -#include -#if defined(_MSC_VER) && !defined(__clang__) && !defined(__GNUC__) -#include -#endif - -namespace miopen { - -template -constexpr std::string_view type_name() -{ -#if defined(__clang__) || defined(__GNUC__) - // clang or gcc - constexpr auto full_name = std::string_view{__PRETTY_FUNCTION__}; -#elif defined(_MSC_VER) - // msvc - constexpr auto full_name = std::string_view{__FUNCSIG__}; -#endif - - // The substring with the data type name is located within the original string, between the - // prefix and the suffix, with the prefix always not at the beginning of the string and the - // suffix always at the end of the string. 
-#if defined(__clang__) - // clang - constexpr auto prefix = std::string_view{"[T = "}; - constexpr auto suffix = std::string_view{"]"}; -#elif defined(__GNUC__) - // gcc - constexpr auto prefix = std::string_view{"[with T = "}; - constexpr auto suffix = std::string_view{"; std::string_view = std::basic_string_view]"}; -#elif defined(_MSC_VER) - // msvc - constexpr auto prefix = std::string_view{"type_name<"}; - constexpr auto suffix = std::string_view{">(void)"}; -#endif - - constexpr auto prefix_pos = full_name.find(prefix); - static_assert(prefix_pos != std::string_view::npos); - - constexpr auto suffix_pos = full_name.rfind(suffix); - static_assert(suffix_pos != std::string_view::npos); - static_assert(suffix_pos == full_name.size() - suffix.size()); - - constexpr auto pos = prefix_pos + prefix.size(); - static_assert(pos < suffix_pos); - constexpr auto count = suffix_pos - pos; - - constexpr auto name = full_name.substr(pos, count); - -#if defined(__clang__) || defined(__GNUC__) - // clang or gcc - return name; -#elif defined(_MSC_VER) - // msvc - if constexpr(std::is_compound_v) - { - // For compound data types, the string contains the keyword 'class/struct/union/enum' before - // the data type name, separated by a space. 
- constexpr auto sep = std::string_view{" "}; - constexpr auto sep_pos = name.find(sep); - static_assert(sep_pos != std::string_view::npos); - static_assert(sep_pos != 0); // must not be at the 0 position - - constexpr auto name_pos = sep_pos + sep.size(); - constexpr auto tname = name.substr(name_pos); - static_assert(tname.size() > 0); - - return tname; - } - else - { - return name; - } -#endif -} - -template -constexpr std::string_view type_name_bare() -{ - constexpr auto name = type_name(); - constexpr auto pos = name.rfind(':'); - if constexpr(pos == std::string_view::npos) - { - constexpr auto result = name; - return result; - } - else - { - constexpr auto bare_name = name.substr(pos + 1); - static_assert(bare_name.size() > 0); - return bare_name; - } -} - -template -const std::string& get_type_name() -{ - static const auto ret = std::string(type_name()); - return ret; -} - -template -const std::string& get_type_name(const T&) -{ - return miopen::get_type_name(); -} - -} // namespace miopen - -#endif +// Forwarding header — implementation moved to common_utils. 
+#include diff --git a/projects/miopen/test/CMakeLists.txt b/projects/miopen/test/CMakeLists.txt index 57601d45ceaf..035f1314fc63 100755 --- a/projects/miopen/test/CMakeLists.txt +++ b/projects/miopen/test/CMakeLists.txt @@ -414,9 +414,9 @@ function(add_test_executable TEST_NAME) endif() # MIOpen_with_plugins ensures CK plugin .so's are built alongside the test if(NOT MIOPEN_EMBED_DB STREQUAL "") - target_link_libraries(${TEST_NAME} MIOpen_with_plugins miopen_data) + target_link_libraries(${TEST_NAME} MIOpen_with_plugins miopen_data miopen_common_utils miopen_utils) else() - target_link_libraries(${TEST_NAME} MIOpen_with_plugins) + target_link_libraries(${TEST_NAME} MIOpen_with_plugins miopen_common_utils miopen_utils) endif() target_include_directories(${TEST_NAME} PRIVATE ../src/kernels) if(WIN32) diff --git a/projects/miopen/test/cpu_bias.hpp b/projects/miopen/test/cpu_bias.hpp index 9b0c2578feef..2abbcccde0da 100644 --- a/projects/miopen/test/cpu_bias.hpp +++ b/projects/miopen/test/cpu_bias.hpp @@ -1,141 +1,2 @@ -/******************************************************************************* - * - * MIT License - * - * Copyright (c) 2019 Advanced Micro Devices, Inc. - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. 
- * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - * - *******************************************************************************/ -#ifndef GUARD_CPU_BIAS_HPP -#define GUARD_CPU_BIAS_HPP - -#include "test.hpp" -#include -#include -#include -#include -#include -#include -#include -#include - -#include "tensor_holder.hpp" -#include -#include - -template -void cpu_bias_forward_impl(tensor& out, const tensor& bias) -{ - assert(out.desc.GetNumDims() == NSpatialDim + 2 and bias.desc.GetNumDims() == NSpatialDim + 2); - assert( - bias.desc.GetLengths()[0] == 1 && bias.desc.GetLengths()[1] == out.desc.GetLengths()[1] && - std::all_of(bias.desc.GetLengths().begin() + 2, bias.desc.GetLengths().end(), [](auto v) { - return v == 1; - })); - - out.par_for_each([&](auto out_n_id, auto out_k_id, auto... out_spatial_id_pack) { - out(out_n_id, out_k_id, out_spatial_id_pack...) 
= - double(out(out_n_id, out_k_id, out_spatial_id_pack...)) + double(bias.data[out_k_id]); - }); -} - -template -void cpu_bias_backward_data_impl(const tensor& out, tensor& bias) -{ - assert(out.desc.GetNumDims() == NSpatialDim + 2 and bias.desc.GetNumDims() == NSpatialDim + 2); - assert( - bias.desc.GetLengths()[0] == 1 && bias.desc.GetLengths()[1] == out.desc.GetLengths()[0] && - std::all_of(bias.desc.GetLengths().begin() + 2, bias.desc.GetLengths().end(), [](auto v) { - return v == 1; - })); - - std::size_t out_n_len = out.desc.GetLengths()[0]; - std::size_t out_k_len = out.desc.GetLengths()[1]; - - std::array out_spatial_len{}; - std::copy_n(out.desc.GetLengths().begin() + 2, NSpatialDim, out_spatial_len.begin()); - - miopen::par_ford(out_k_len)([&](auto out_k_id) { - auto ford_out_n_spatial = - miopen::unpacker(miopen::prepender(miopen::ford, out_n_len))(out_spatial_len); - - double acc = 0; - ford_out_n_spatial([&](auto out_n_id, auto... out_spatial_id_pack) { - acc += double(out(out_n_id, out_k_id, out_spatial_id_pack...)); - }); - - bias.data[out_k_id] = acc; - }); -} - -template -void cpu_bias_forward(tensor& out, const tensor& bias) -{ - switch(out.desc.GetNumDims()) - { - case 3: { - cpu_bias_forward_impl<1>(out, bias); - break; - } - case 4: { - cpu_bias_forward_impl<2>(out, bias); - break; - } - case 5: { - cpu_bias_forward_impl<3>(out, bias); - break; - } - case 6: { - cpu_bias_forward_impl<4>(out, bias); - break; - } - default: { - MIOPEN_THROW("not belong to any case"); - } - } -} - -template -void cpu_bias_backward_data(const tensor& out, tensor& bias) -{ - switch(out.desc.GetNumDims()) - { - case 3: { - cpu_bias_backward_data_impl<1>(out, bias); - break; - } - case 4: { - cpu_bias_backward_data_impl<2>(out, bias); - break; - } - case 5: { - cpu_bias_backward_data_impl<3>(out, bias); - break; - } - case 6: { - cpu_bias_backward_data_impl<4>(out, bias); - break; - } - default: { - MIOPEN_THROW("not belong to any case"); - } - } -} -#endif +// 
Forwarding header — implementation moved to miopen_utils. +#include diff --git a/projects/miopen/test/cpu_conv.hpp b/projects/miopen/test/cpu_conv.hpp index 895262311b12..818e215c45e2 100644 --- a/projects/miopen/test/cpu_conv.hpp +++ b/projects/miopen/test/cpu_conv.hpp @@ -1,515 +1,2 @@ -/******************************************************************************* - * - * MIT License - * - * Copyright (c) 2019 Advanced Micro Devices, Inc. - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - * - *******************************************************************************/ -#ifndef GUARD_CPU_CONV_HPP -#define GUARD_CPU_CONV_HPP - -#include "test.hpp" -#include -#include -#include -#include -#include -#include -#include -#include - -#include "tensor_holder.hpp" -#include -#include -#include - -template -static constexpr auto make_array(T x, Ts... 
xs) -{ - return std::array{{x, xs...}}; -} - -template -struct PassThru -{ - T operator()(T t) { return t; } -}; - -template -struct cpu_convolution_acc_type -{ - using type = double; // default using double as accumulator -}; - -template <> -struct cpu_convolution_acc_type -{ - using type = int32_t; -}; - -template <> -struct cpu_convolution_acc_type -{ - using type = double; -}; - -template -void cpu_convolution_forward_impl(const tensor& in, - const tensor& wei, - tensor& out, - const Range& pads, - const Range& strides, - const Range& dilations, - std::size_t group_count, - FI fi = {}, - FW fw = {}) -{ - static_assert(ConvDim > 0, "wrong! convolution dim should be larger than 0"); - assert(in.desc.GetNumDims() == ConvDim + 2 and wei.desc.GetNumDims() == ConvDim + 2 and - out.desc.GetNumDims() == ConvDim + 2 and pads.size() == ConvDim and - strides.size() == ConvDim and dilations.size() == ConvDim); - std::size_t out_n_len = out.desc.GetLengths()[0]; - - std::size_t wei_k_len = wei.desc.GetLengths()[0]; - std::size_t wei_c_len = wei.desc.GetLengths()[1]; - - std::size_t vector_len = in.desc.GetVectorLength(); - - std::array in_spatial_len{}; - std::array wei_spatial_len{}; - std::array out_spatial_len{}; - - std::copy_n(in.desc.GetLengths().begin() + 2, ConvDim, in_spatial_len.begin()); - std::copy_n(wei.desc.GetLengths().begin() + 2, ConvDim, wei_spatial_len.begin()); - std::copy_n(out.desc.GetLengths().begin() + 2, ConvDim, out_spatial_len.begin()); - - if(wei.desc.GetLayout_str() == "CHWNc") - { - wei_c_len = wei.desc.GetLengths()[0]; - std::copy_n(wei.desc.GetLengths().begin() + 1, ConvDim, wei_spatial_len.begin()); - wei_k_len = wei.desc.GetLengths()[3]; - } - - std::size_t wei_k_len_per_group = wei_k_len / group_count; - - // f(x0, x1, xs...) - // f1(xs...) = f(x0, x1, xs...) - // f2(xs_array) = f1(xs...) 
- auto par_ford_out_nk_spatial = miopen::unpacker( - miopen::prepender(miopen::par_ford, out_n_len, wei_k_len))(out_spatial_len); - - par_ford_out_nk_spatial([&](std::size_t out_n_id, - std::size_t out_k_id, - auto... out_spatial_id_pack) { - auto out_spatial_id = make_array(out_spatial_id_pack...); - - std::size_t group_id = out_k_id / wei_k_len_per_group; - Tacc acc = 0; - - miopen::ford(wei_c_len)([&](std::size_t wei_c_id) { - std::size_t in_c_id = group_id * wei_c_len + wei_c_id; - - auto ford_wei_spatial = miopen::unpacker(miopen::ford)(wei_spatial_len); - - ford_wei_spatial([&](auto... wei_spatial_id_pack) { - auto wei_spatial_id = make_array(wei_spatial_id_pack...); - - std::array in_spatial_id{}; - - for(std::size_t i = 0; i < ConvDim; ++i) - { - in_spatial_id[i] = - out_spatial_id[i] * strides[i] + wei_spatial_id[i] * dilations[i] - pads[i]; - } - bool out_of_bound = false; - for(std::size_t i = 0; i < ConvDim; ++i) - { - out_of_bound = out_of_bound or - (in_spatial_id[i] < 0 or in_spatial_id[i] >= in_spatial_len[i]); - } - if(!out_of_bound) - { - if(vector_len > 1) - { - std::array in_id{}; - in_id[1] = out_n_id; - in_id[2] = in_c_id; - std::copy_n(in_spatial_id.begin(), ConvDim, in_id.begin() + 3); - for(std::size_t i = 0; i < vector_len; i++) - { - in_id[0] = i; - acc += Tacc(in(in_id)) * - Tacc(wei(i, out_k_id, wei_c_id, wei_spatial_id_pack...)); - } - } - else - { - std::array in_id{}; - in_id[0] = out_n_id; - in_id[1] = in_c_id; - std::copy_n(in_spatial_id.begin(), ConvDim, in_id.begin() + 2); - Tacc tmp1 = static_cast(fi(in(in_id))); - Tacc tmp2 = - static_cast(fw(wei(out_k_id, wei_c_id, wei_spatial_id_pack...))); - acc += tmp1 * tmp2; - } - } - }); - }); - if(vector_len > 1) - { - out(out_k_id % vector_len, out_n_id, out_k_id / vector_len, out_spatial_id_pack...) = - static_cast(acc); - } - else - { - out(out_n_id, out_k_id, out_spatial_id_pack...) 
= static_cast(acc); - } - }); -} - -template -void cpu_convolution_backward_data_impl(tensor& in, - const tensor& wei, - const tensor& out, - const Range& pads, - const Range& strides, - const Range& dilations, - std::size_t group_count, - FW fw = {}, - FO fo = {}) -{ - static_assert(ConvDim > 0, "wrong! convolution dim should be larger than 0"); - assert(in.desc.GetNumDims() == ConvDim + 2 and wei.desc.GetNumDims() == ConvDim + 2 and - out.desc.GetNumDims() == ConvDim + 2 and pads.size() == ConvDim and - strides.size() == ConvDim and dilations.size() == ConvDim); - - std::size_t in_n_len = in.desc.GetLengths()[0]; - std::size_t in_c_len = in.desc.GetLengths()[1]; - - std::size_t wei_k_len = wei.desc.GetLengths()[0]; - std::size_t wei_c_len = wei.desc.GetLengths()[1]; - - std::size_t wei_k_len_per_group = wei_k_len / group_count; - - std::array in_spatial_len{}; - std::array wei_spatial_len{}; - std::array out_spatial_len{}; - - std::copy_n(in.desc.GetLengths().begin() + 2, ConvDim, in_spatial_len.begin()); - std::copy_n(wei.desc.GetLengths().begin() + 2, ConvDim, wei_spatial_len.begin()); - std::copy_n(out.desc.GetLengths().begin() + 2, ConvDim, out_spatial_len.begin()); - - auto par_ford_in_nc_spatial = - miopen::unpacker(miopen::prepender(miopen::par_ford, in_n_len, in_c_len))(in_spatial_len); - - par_ford_in_nc_spatial( - [&](std::size_t in_n_id, std::size_t in_c_id, auto... in_spatial_id_pack) { - auto in_spatial_id = make_array(in_spatial_id_pack...); - - std::size_t group_id = in_c_id / wei_c_len; - - Tacc acc = 0; - - miopen::ford(wei_k_len_per_group)([&](std::size_t wei_k_id_inside_group) { - auto ford_wei_spatial = miopen::unpacker(miopen::ford)(wei_spatial_len); - - ford_wei_spatial([&](auto... 
wei_spatial_id_pack) { - auto wei_spatial_id = make_array(wei_spatial_id_pack...); - - std::array out_spatial_id_{}; - std::array out_spatial_id{}; - - for(std::size_t i = 0; i < ConvDim; ++i) - { - out_spatial_id_[i] = - pads[i] + in_spatial_id[i] - wei_spatial_id[i] * dilations[i]; - out_spatial_id[i] = out_spatial_id_[i] / strides[i]; - } - - bool use = true; - for(std::size_t i = 0; i < ConvDim; ++i) - { - use &= out_spatial_id_[i] % strides[i] == 0 and out_spatial_id[i] >= 0 and - out_spatial_id[i] < out_spatial_len[i]; - } - - if(use) - { - std::size_t out_k_id = - group_id * wei_k_len_per_group + wei_k_id_inside_group; - std::size_t wei_c_id = in_c_id % wei_c_len; - - std::array out_id{}; - out_id[0] = in_n_id; - out_id[1] = out_k_id; - std::copy_n(out_spatial_id.begin(), ConvDim, out_id.begin() + 2); - Tacc tmp1 = fo(out(out_id)); - Tacc tmp2 = fw(wei(out_k_id, wei_c_id, wei_spatial_id_pack...)); - acc += tmp1 * tmp2; - } - }); - }); - // TODO: Why do we need a no-lint here ? - in(in_n_id, in_c_id, in_spatial_id_pack...) = static_cast(acc); // NOLINT - }); -} - -template -void cpu_convolution_backward_weight_impl(const tensor& in, - tensor& wei, - const tensor& out, - const Range& pads, - const Range& strides, - const Range& dilations, - std::size_t group_count, - FI fi, - FO fo) -{ - static_assert(ConvDim > 0, "wrong! 
convolution dim should be larger than 0"); - assert(in.desc.GetNumDims() == ConvDim + 2 and wei.desc.GetNumDims() == ConvDim + 2 and - out.desc.GetNumDims() == ConvDim + 2 and pads.size() == ConvDim and - strides.size() == ConvDim and dilations.size() == ConvDim); - - std::size_t out_n_len = out.desc.GetLengths()[0]; - - std::size_t wei_k_len = wei.desc.GetLengths()[0]; - std::size_t wei_c_len = wei.desc.GetLengths()[1]; - - std::size_t wei_k_len_per_group = wei_k_len / group_count; - - std::array in_spatial_len{}; - std::array wei_spatial_len{}; - std::array out_spatial_len{}; - - std::copy_n(in.desc.GetLengths().begin() + 2, ConvDim, in_spatial_len.begin()); - std::copy_n(wei.desc.GetLengths().begin() + 2, ConvDim, wei_spatial_len.begin()); - std::copy_n(out.desc.GetLengths().begin() + 2, ConvDim, out_spatial_len.begin()); - - auto par_ford_wei_kc_spatial = miopen::unpacker( - miopen::prepender(miopen::par_ford, wei_k_len, wei_c_len))(wei_spatial_len); - - par_ford_wei_kc_spatial( - [&](std::size_t wei_k_id, std::size_t wei_c_id, auto... wei_spatial_id_pack) { - auto wei_spatial_id = make_array(wei_spatial_id_pack...); - - std::size_t group_id = wei_k_id / wei_k_len_per_group; - std::size_t in_c_id = group_id * wei_c_len + wei_c_id; - - Tacc acc = 0; - - miopen::ford(out_n_len)([&](std::size_t out_n_id) { - auto ford_out_spatial = miopen::unpacker(miopen::ford)(out_spatial_len); - - ford_out_spatial([&](auto... 
out_spatial_id_pack) { - auto out_spatial_id = make_array(out_spatial_id_pack...); - - std::array in_spatial_id{}; - - for(std::size_t i = 0; i < ConvDim; ++i) - { - in_spatial_id[i] = out_spatial_id[i] * strides[i] + - wei_spatial_id[i] * dilations[i] - pads[i]; - } - - bool out_of_bound = false; - for(std::size_t i = 0; i < ConvDim; ++i) - { - out_of_bound = out_of_bound or (in_spatial_id[i] < 0 or - in_spatial_id[i] >= in_spatial_len[i]); - } - - if(!out_of_bound) - { - std::array in_id{}; - in_id[0] = out_n_id; - in_id[1] = in_c_id; - std::copy_n(in_spatial_id.begin(), ConvDim, in_id.begin() + 2); - Tacc tmp1 = fi(in(in_id)); - Tacc tmp2 = fo(out(out_n_id, wei_k_id, out_spatial_id_pack...)); - acc += tmp1 * tmp2; - } - }); - - wei(wei_k_id, wei_c_id, wei_spatial_id_pack...) = static_cast(acc); - }); - }); -} - -template , - typename FW = PassThru> -void cpu_convolution_forward(std::size_t spatial_dim, - const tensor& in, - const tensor& wei, - tensor& out, - const Range& pads, - const Range& strides, - const Range& dilations, - std::size_t group_count, - FI fi = {}, - FW fw = {}) -{ - switch(spatial_dim) - { - case 1: { - cpu_convolution_forward_impl<1, Tacc>( - in, wei, out, pads, strides, dilations, group_count, fi, fw); - break; - } - case 2: { - cpu_convolution_forward_impl<2, Tacc>( - in, wei, out, pads, strides, dilations, group_count, fi, fw); - break; - } - case 3: { - cpu_convolution_forward_impl<3, Tacc>( - in, wei, out, pads, strides, dilations, group_count, fi, fw); - break; - } - case 4: { - cpu_convolution_forward_impl<4, Tacc>( - in, wei, out, pads, strides, dilations, group_count, fi, fw); - break; - } - default: { - MIOPEN_THROW("not belong to any case"); - } - } -} - -template , - typename FO = PassThru> -void cpu_convolution_backward_data(std::size_t spatial_dim, - tensor& in, - const tensor& wei, - const tensor& out, - const Range& pads, - const Range& strides, - const Range& dilations, - std::size_t group_count, - FW fw = {}, - FO fo = {}) 
-{ - switch(spatial_dim) - { - case 1: { - cpu_convolution_backward_data_impl<1, Tacc>( - in, wei, out, pads, strides, dilations, group_count, fw, fo); - break; - } - case 2: { - cpu_convolution_backward_data_impl<2, Tacc>( - in, wei, out, pads, strides, dilations, group_count, fw, fo); - break; - } - case 3: { - cpu_convolution_backward_data_impl<3, Tacc>( - in, wei, out, pads, strides, dilations, group_count, fw, fo); - break; - } - case 4: { - cpu_convolution_backward_data_impl<4, Tacc>( - in, wei, out, pads, strides, dilations, group_count, fw, fo); - break; - } - default: { - MIOPEN_THROW("not belong to any case"); - } - } -} - -template , - typename FO = PassThru> -void cpu_convolution_backward_weight(std::size_t spatial_dim, - const tensor& in, - tensor& wei, - const tensor& out, - const Range& pads, - const Range& strides, - const Range& dilations, - std::size_t group_count, - FI fi = {}, - FO fo = {}) -{ - switch(spatial_dim) - { - case 1: { - cpu_convolution_backward_weight_impl<1, Tacc>( - in, wei, out, pads, strides, dilations, group_count, fi, fo); - break; - } - case 2: { - cpu_convolution_backward_weight_impl<2, Tacc>( - in, wei, out, pads, strides, dilations, group_count, fi, fo); - break; - } - case 3: { - cpu_convolution_backward_weight_impl<3, Tacc>( - in, wei, out, pads, strides, dilations, group_count, fi, fo); - break; - } - case 4: { - cpu_convolution_backward_weight_impl<4, Tacc>( - in, wei, out, pads, strides, dilations, group_count, fi, fo); - break; - } - default: { - MIOPEN_THROW("not belong to any case"); - } - } -} -#endif +// Forwarding header — implementation moved to miopen_utils. +#include diff --git a/projects/miopen/test/cpu_layernorm.hpp b/projects/miopen/test/cpu_layernorm.hpp index 8b5bf965deab..a9f7b139484c 100644 --- a/projects/miopen/test/cpu_layernorm.hpp +++ b/projects/miopen/test/cpu_layernorm.hpp @@ -1,216 +1,2 @@ -// Copyright © Advanced Micro Devices, Inc., or its affiliates. 
-// SPDX-License-Identifier: MIT -#ifndef GUARD_CPU_CONV_HPP -#define GUARD_CPU_CONV_HPP - -#include <../test/tensor_holder.hpp> - -template -void cpu_layernorm_forward(tensor input, - tensor weight, - tensor bias, - tensor& ref_output, - tensor& ref_mean, - tensor& ref_rstd, - float eps, - int32_t dim, - miopenNormMode_t mode, - bool use_multithread = false) -{ - auto layout = input.desc.GetLayoutEnum(); - size_t stride = 1; - if(dim > 1 && layout.has_value() && - (layout.value() == miopenTensorNHWC || layout.value() == miopenTensorNDHWC)) - { - stride = input.desc.GetLengths()[1]; // stride = C - } - - auto dims = input.desc.GetLengths(); - size_t outer_size = 1; - size_t inner_size = 1; - for(size_t i = 0; i < dims.size(); ++i) - { - if(i < dim) - { - if(!(stride > 1 && i == 1)) - { - outer_size *= dims[i]; - } - } - else - { - inner_size *= dims[i]; - } - } - - size_t min_grain = use_multithread ? 8 : outer_size; - miopen::par_for(outer_size, min_grain, [&](int32_t o) { - miopen::ford(stride)([&](int32_t s) { - double mean_v = 0.0; - double var_v = 0.0; - - miopen::ford(inner_size)([&](int32_t i) { - double tmp = static_cast(input[o * inner_size * stride + i * stride + s]); - mean_v += tmp; - var_v += tmp * tmp; - }); - - mean_v = mean_v / inner_size; - var_v = var_v / inner_size - mean_v * mean_v; - double rstd_v = 1.0 / sqrt(var_v + eps); - - ref_mean[o * stride + s] = static_cast(mean_v); - ref_rstd[o * stride + s] = static_cast(rstd_v); - - miopen::ford(inner_size)([&](int32_t i) { - double weight_v = - (mode == MIOPEN_ELEMENTWISE_AFFINE) ? 1.0 : static_cast(weight[i]); - double bias_v = - (mode == MIOPEN_ELEMENTWISE_AFFINE) ? 
0.0 : static_cast(bias[i]); - - ref_output[o * inner_size * stride + i * stride + s] = static_cast( - (static_cast(input[o * inner_size * stride + i * stride + s]) - - mean_v) * - rstd_v * weight_v + - bias_v); - }); - }); - }); -} - -template -void cpu_layernorm_backward(tensor dy, - tensor x, - tensor weight, - tensor mean, - tensor rstd, - tensor& ref_dx, - int32_t dim, - miopenNormMode_t mode, - bool use_multithread = false) -{ - auto layout = dy.desc.GetLayoutEnum(); - size_t stride = 1; - if(dim > 1 && (layout == miopenTensorNHWC || layout == miopenTensorNDHWC)) - { - stride = dy.desc.GetLengths()[1]; // stride = C - } - - auto dims = dy.desc.GetLengths(); - size_t outer_size = 1; - size_t inner_size = 1; - for(size_t i = 0; i < dims.size(); ++i) - { - if(i < dim) - { - if(!(stride > 1 && i == 1)) - { - outer_size *= dims[i]; - } - } - else - { - inner_size *= dims[i]; - } - } - - size_t min_grain = use_multithread ? 8 : outer_size; - miopen::par_for(outer_size, min_grain, [&](int32_t o) { - miopen::ford(stride)([&](int32_t s) { - double sum_dy_weight = 0.0; - double sum_dy_weight_x = 0.0; - - miopen::ford(inner_size)([&](int32_t i) { - double pweight = - (mode == MIOPEN_ELEMENTWISE_AFFINE) ? 1.0 : static_cast(weight[i]); - double pdy = (dy.GetSize() != 0) - ? static_cast(dy[o * inner_size * stride + i * stride + s]) - : 0.0; - double px = static_cast(x[o * inner_size * stride + i * stride + s]); - - sum_dy_weight += pdy * pweight; - sum_dy_weight_x += pdy * px * pweight; - }); - - double scale = 1.0 / static_cast(inner_size); - double prstd = static_cast(rstd[o * stride + s]); - double pmean = static_cast(mean[o * stride + s]); - double a = prstd * prstd * prstd * scale * (sum_dy_weight_x - sum_dy_weight * pmean); - double b = prstd * sum_dy_weight * scale - a * pmean; - - miopen::ford(inner_size)([&](int32_t i) { - double pweight = - (mode == MIOPEN_ELEMENTWISE_AFFINE) ? 1.0 : static_cast(weight[i]); - double pdy = (dy.GetSize() != 0) - ? 
static_cast(dy[o * inner_size * stride + i * stride + s]) - : 0.0; - double val = prstd * pdy * pweight - - a * static_cast(x[o * inner_size * stride + i * stride + s]) - - b; - - ref_dx[o * inner_size * stride + i * stride + s] = static_cast(val); - }); - }); - }); -} - -template -void cpu_layernorm_backward_weight_bias(tensor dy, - tensor x, - tensor mean, - tensor rstd, - tensor& ref_dw, - tensor& ref_db, - int32_t dim, - bool use_multithread = false) -{ - auto layout = dy.desc.GetLayoutEnum(); - size_t stride = 1; - if(dim > 1 && (layout == miopenTensorNHWC || layout == miopenTensorNDHWC)) - { - stride = dy.desc.GetLengths()[1]; // stride = C - } - - auto dims = dy.desc.GetLengths(); - size_t outer_size = 1; - size_t inner_size = 1; - for(size_t i = 0; i < dims.size(); ++i) - { - if(i < dim) - { - if(!(stride > 1 && i == 1)) - { - outer_size *= dims[i]; - } - } - else - { - inner_size *= dims[i]; - } - } - - size_t min_grain = use_multithread ? 8 : inner_size; - miopen::par_for(inner_size, min_grain, [&](int32_t i) { - double sum_dw = 0.0; - double sum_db = 0.0; - - miopen::ford(stride)([&](int32_t s) { - miopen::ford(outer_size)([&](int32_t o) { - double prstd = static_cast(rstd[o * stride + s]); - double pmean = static_cast(mean[o * stride + s]); - double pdy = (dy.GetSize() != 0) - ? static_cast(dy[o * inner_size * stride + i * stride + s]) - : 0; - double px = static_cast(x[o * inner_size * stride + i * stride + s]); - - sum_dw += pdy * (px - pmean) * prstd; - sum_db += pdy; - }); - }); - - ref_dw[i] = sum_dw; - ref_db[i] = sum_db; - }); -} - -#endif +// Forwarding header — implementation moved to miopen_utils. 
+#include diff --git a/projects/miopen/test/cpu_reduce_util.hpp b/projects/miopen/test/cpu_reduce_util.hpp index 88728b02faec..401dd20b994b 100644 --- a/projects/miopen/test/cpu_reduce_util.hpp +++ b/projects/miopen/test/cpu_reduce_util.hpp @@ -1,649 +1,2 @@ -/******************************************************************************* - * - * MIT License - * - * Copyright (c) 2020 Advanced Micro Devices, Inc. - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- * - *******************************************************************************/ -#ifndef GUARD_CPU_REDUCE_UTIL_HPP -#define GUARD_CPU_REDUCE_UTIL_HPP - -#include "miopen/reducetensor.hpp" -#include "tensor_holder.hpp" -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -namespace reduce { - -template -static inline bool float_equal_one(T); - -static inline bool float_equal_one(float x) { return x == 1.0f; }; - -static inline bool float_equal_one(double x) { return x == 1.0; }; - -static inline bool float_equal_one(half_float::half x) -{ - return x == convert_type(1.0f); -}; - -template -static inline bool float_equal_zero(T x); - -static inline bool float_equal_zero(float x) { return x == 0.0f; }; - -static inline bool float_equal_zero(double x) { return x == 0.0; }; - -static inline bool float_equal_zero(half_float::half x) -{ - return x == convert_type(0.0f); -}; - -template -static inline void build_radix(const std::vector& lens, std::vector& radix) -{ - const std::size_t D = lens.size(); - radix.assign(D, 1); - for(std::size_t d = D; d-- > 1;) - radix[d - 1] = radix[d] * static_cast(lens[d]); // radix[d] = Π_{k>d} lens[k] -} - -// i -> memory offset using lens-radix + actual strides -template -static inline std::size_t linear_to_offset_by_lens_strides(std::size_t i, - const std::vector& lens, - const std::vector& radix, - const std::vector& strides) -{ - std::size_t off = 0; - for(std::size_t d = 0; d < lens.size(); ++d) - { - const std::size_t idx_d = (i / radix[d]) % static_cast(lens[d]); - off += idx_d * static_cast(strides[d]); - } - return off; -} - -template -static inline std::function PreUnaryOpFn(miopenReduceTensorOp_t op_, std::size_t) -{ - using std::abs; - - switch(op_) - { - case MIOPEN_REDUCE_TENSOR_NORM1: return ([&](compType& a_) { a_ = abs(a_); }); - case MIOPEN_REDUCE_TENSOR_NORM2: return ([&](compType& a_) { a_ = a_ * a_; }); - case MIOPEN_REDUCE_TENSOR_AMAX: return ([&](compType& a_) { 
a_ = abs(a_); }); - - case MIOPEN_REDUCE_TENSOR_AVG: - case MIOPEN_REDUCE_TENSOR_ADD: - case MIOPEN_REDUCE_TENSOR_MUL: - case MIOPEN_REDUCE_TENSOR_MIN: - case MIOPEN_REDUCE_TENSOR_MAX: return ([&](compType&) {}); - } - - throw std::runtime_error(std::string(__FUNCTION__) + - ": using undefined Reduction operation is not permitted"); -}; - -template -static inline std::function PosUnaryOpFn(miopenReduceTensorOp_t op_, - std::size_t divider) -{ - using std::sqrt; - - switch(op_) - { - case MIOPEN_REDUCE_TENSOR_NORM2: return ([&](compType& a_) { a_ = sqrt(a_); }); - - case MIOPEN_REDUCE_TENSOR_AVG: - return ([&, divider](compType& a_) { - a_ = a_ / convert_type(static_cast(divider)); - }); - - case MIOPEN_REDUCE_TENSOR_ADD: - case MIOPEN_REDUCE_TENSOR_NORM1: - case MIOPEN_REDUCE_TENSOR_MUL: - case MIOPEN_REDUCE_TENSOR_MIN: - case MIOPEN_REDUCE_TENSOR_MAX: - case MIOPEN_REDUCE_TENSOR_AMAX: return ([&](compType&) {}); - } - - throw std::runtime_error(std::string(__FUNCTION__) + - ": using undefined Reduction operation is not permitted"); -}; - -template -static inline std::function ReduceOpFn(miopenReduceTensorOp_t op_) -{ - switch(op_) - { - case MIOPEN_REDUCE_TENSOR_ADD: - case MIOPEN_REDUCE_TENSOR_AVG: - case MIOPEN_REDUCE_TENSOR_NORM1: - case MIOPEN_REDUCE_TENSOR_NORM2: return ([&](compType& a_, compType b_) { a_ = a_ + b_; }); - - case MIOPEN_REDUCE_TENSOR_MUL: return ([&](compType& a_, compType b_) { a_ = a_ * b_; }); - - case MIOPEN_REDUCE_TENSOR_MIN: - return ([&](compType& a_, compType b_) { - if(a_ > b_) - a_ = b_; - }); - - case MIOPEN_REDUCE_TENSOR_MAX: - case MIOPEN_REDUCE_TENSOR_AMAX: - return ([&](compType& a_, compType b_) { - if(a_ < b_) - a_ = b_; - }); - } - - throw std::runtime_error(std::string(__FUNCTION__) + - ": using undefined Reduction operation is not permitted"); -}; - -template -static inline std::function -ReduceOpFn2(miopenReduceTensorOp_t op_) -{ - switch(op_) - { - case MIOPEN_REDUCE_TENSOR_MIN: - return ([&](compType& a_, compType b_, 
bool& changed) { - if(a_ > b_) - { - a_ = b_; - changed = true; - } - else - { - changed = false; - } - }); - - case MIOPEN_REDUCE_TENSOR_MAX: - case MIOPEN_REDUCE_TENSOR_AMAX: - return ([&](compType& a_, compType b_, bool& changed) { - if(a_ < b_) - { - a_ = b_; - changed = true; - } - else - { - changed = false; - } - }); - - case MIOPEN_REDUCE_TENSOR_ADD: - case MIOPEN_REDUCE_TENSOR_MUL: - case MIOPEN_REDUCE_TENSOR_AVG: - case MIOPEN_REDUCE_TENSOR_NORM1: - case MIOPEN_REDUCE_TENSOR_NORM2: return (std::function{}); - }; - - throw std::runtime_error(std::string(__FUNCTION__) + - ": using undefined Reduction operation is not permitted"); -}; - -template -static inline compType ReduceOpZeroVal(miopenReduceTensorOp_t op_) -{ - switch(op_) - { - case MIOPEN_REDUCE_TENSOR_ADD: - case MIOPEN_REDUCE_TENSOR_AVG: - case MIOPEN_REDUCE_TENSOR_NORM1: - case MIOPEN_REDUCE_TENSOR_NORM2: return (convert_type(0.0f)); - - case MIOPEN_REDUCE_TENSOR_MUL: return (convert_type(1.0f)); - - case MIOPEN_REDUCE_TENSOR_MIN: return (std::numeric_limits::max()); - - case MIOPEN_REDUCE_TENSOR_MAX: return (std::numeric_limits::lowest()); - case MIOPEN_REDUCE_TENSOR_AMAX: return (convert_type(0.0f)); - } - - throw std::runtime_error(std::string(__FUNCTION__) + - ": using undefined Reduction operation is not permitted"); -}; - -template -static inline void binop_with_nan_check(miopenNanPropagation_t nanOpt, - reduceOpT&& opReduce, - compType& accuVal, - compType currVal) -{ - using std::isnan; - - if(nanOpt == MIOPEN_NOT_PROPAGATE_NAN) - { - opReduce(accuVal, currVal); - } - else - { - if(isnan(currVal)) - accuVal = currVal; - else - opReduce(accuVal, currVal); - }; -}; - -template -static inline void binop_with_nan_check2(miopenNanPropagation_t nanOpt, - reduceOpT&& opReduce, - compType& accuVal, - compType currVal, - int& accuIndex, - int currIndex) -{ - using std::isnan; - - if(nanOpt == MIOPEN_NOT_PROPAGATE_NAN) - { - bool changed; - - opReduce(accuVal, currVal, changed); - - if(changed) - 
accuIndex = currIndex; - } - else - { - if(isnan(currVal)) - { - accuVal = currVal; - accuIndex = currIndex; - } - else - { - bool changed; - - opReduce(accuVal, currVal, changed); - - if(changed) - accuIndex = currIndex; - }; - }; -}; - -}; // end of namespace reduce - -template -std::vector> get_all_indexes(const std::vector& lens) -{ - const std::size_t D = lens.size(); - assert(D > 0); - - std::size_t N = 1; - for(const auto L : lens) - N *= static_cast(L); - - std::vector> out; - out.resize(N); - for(auto& row : out) - row.resize(D); - - std::vector stride(D, 1); - for(std::size_t d = D; d-- > 1;) - stride[d - 1] = stride[d] * static_cast(lens[d]); - - for(std::size_t r = 0; r < N; ++r) - { - for(std::size_t d = 0; d < D; ++d) - out[r][d] = static_cast((r / stride[d]) % static_cast(lens[d])); - } - - return out; -} - -template -static inline T -linear_to_offset(size_t li, const std::vector& lens, const std::vector& strides) -{ - T off = 0; - for(int d = int(lens.size()) - 1; d >= 0; --d) - { - const T idx = li % lens[d]; - li /= lens[d]; - off += idx * strides[d]; - } - return off; -} - -template -T get_offset_from_index(const std::vector& strides, const std::vector& index) -{ - T offset = 0; - - assert(strides.size() == index.size()); - - for(int i = 0; i < index.size(); i++) - offset += strides[i] * index[i]; - - return (offset); -}; - -template -T get_flatten_offset(const std::vector& lengths, const std::vector& index) -{ - T offset = 0; - - assert(lengths.size() == index.size() && !lengths.empty()); - - int len = lengths.size(); - T stride = 1; - - // for len==1, the loop is not executed - for(int i = len - 1; i > 0; i--) - { - offset += stride * index[i]; - - stride *= lengths[i]; - }; - - offset += stride * index[0]; - - return (offset); -}; - -template -struct Reducer -{ - compType acc; - bool withIdx; - int idx; // meaningful only when WithIdx==true - miopenNanPropagation_t nanOpt; - // functors for reduction - 
decltype(reduce::ReduceOpFn(MIOPEN_REDUCE_TENSOR_ADD)) opNoIdx; - decltype(reduce::ReduceOpFn2(MIOPEN_REDUCE_TENSOR_ADD)) opWithIdx; - - Reducer(miopenNanPropagation_t n, miopenReduceTensorOp_t rop, compType zero, bool useIdx) - : acc(zero), - withIdx(useIdx), - idx(0), - nanOpt(n), - opNoIdx(reduce::ReduceOpFn(rop)), - opWithIdx(reduce::ReduceOpFn2(rop)) - { - } - - inline void step(compType v, int flat_i) - { - if(withIdx) - reduce::binop_with_nan_check2(nanOpt, opWithIdx, acc, v, idx, flat_i); - else - reduce::binop_with_nan_check(nanOpt, opNoIdx, acc, v); - } - - inline void combine(const Reducer& other) - { - if(withIdx) - reduce::binop_with_nan_check2(nanOpt, opWithIdx, acc, other.acc, idx, other.idx); - else - reduce::binop_with_nan_check(nanOpt, opNoIdx, acc, other.acc); - } -}; - -template -std::tuple, tensor> reduce_cpu_common(const miopenReduceTensorOp_t& reduceOp, - const miopenNanPropagation_t& nanOpt, - const std::vector& inLengths, - const std::vector& outLengths, - const std::vector& input, - const std::vector& inStrides, - const std::vector& output, - const std::vector& outStrides, - float alpha, - float beta, - bool parallel, - bool withIdx) -{ - using reduce::convert_type; - using reduce::ReduceOpZeroVal; - - // Partition dims - std::vector invariantDims, toReduceDims; - std::vector invLens, redLens, invStrides_v, redStrides_v; - - for(int i = 0; i < static_cast(inLengths.size()); ++i) - { - if(inLengths[i] == outLengths[i]) - { - invariantDims.push_back(i); - invLens.push_back(inLengths[i]); - invStrides_v.push_back(inStrides[i]); - } - else - { - toReduceDims.push_back(i); - redLens.push_back(inLengths[i]); - redStrides_v.push_back(inStrides[i]); - } - } - - const bool reduceAllDims = invariantDims.empty(); - - // unary ops & zero vals - const compType zeroV = ReduceOpZeroVal(reduceOp); - - // divider = Π reduced dims (or N if reduce-all) - std::size_t divider = 1; - if(reduceAllDims) - divider = std::accumulate( - inLengths.begin(), 
inLengths.end(), std::size_t{1}, std::multiplies<>()); - else - divider = - std::accumulate(redLens.begin(), redLens.end(), std::size_t{1}, std::multiplies<>()); - - auto PreUnaryOp = reduce::PreUnaryOpFn(reduceOp, divider); - auto PosUnaryOp = reduce::PosUnaryOpFn(reduceOp, divider); - - // outputs - auto res = tensor{outLengths}; - res.data = output; - auto res_indices = tensor{outLengths}; - if(withIdx) - std::fill(res_indices.begin(), res_indices.end(), 0); - - if(reduceAllDims) - { - // Flatten whole tensor - const std::size_t N = divider; // product of all dims - std::vector lens_radix; - reduce::build_radix(inLengths, lens_radix); - - // parallel chunking - std::size_t hw = - std::max(std::size_t{1}, static_cast(std::thread::hardware_concurrency())); - const std::size_t P = std::min(N, hw * 4ul); - const std::size_t chunk = (N + P - 1) / P; - - std::vector> partial; - partial.reserve(P); - for(std::size_t p = 0; p < P; ++p) - partial.emplace_back(nanOpt, reduceOp, zeroV, withIdx); - - auto worker = [&](int p) { - const std::size_t begin = std::size_t(p) * chunk; - const std::size_t end = std::min(begin + chunk, N); - - auto& r = partial[p]; - for(std::size_t i = begin; i < end; ++i) - { - const auto off = - reduce::linear_to_offset_by_lens_strides(i, inLengths, lens_radix, inStrides); - auto v = convert_type(input[off]); - PreUnaryOp(v); - r.step(v, static_cast(i)); // flat index across whole tensor - } - }; - - if(parallel) - { - miopen::par_for(static_cast(P), worker); - } - else - { - for(int p = 0; p < P; ++p) - { - worker(p); - } - } - - // combine - Reducer R(nanOpt, reduceOp, zeroV, withIdx); - for(std::size_t p = 0; p < P; ++p) - R.combine(partial[p]); - - // post - PosUnaryOp(R.acc); - if(alpha != 1.0f) - R.acc *= convert_type(alpha); - if(beta != 0.0f) - R.acc += convert_type(output[0]) * convert_type(beta); - - res.data[0] = convert_type(R.acc); - if(withIdx) - res_indices.data[0] = R.idx; - } - else - { - // Build radices for invariant and 
reduced subspaces - std::vector invRad, redRad; - reduce::build_radix(invLens, invRad); - reduce::build_radix(redLens, redRad); - - const std::size_t INV = - std::accumulate(invLens.begin(), invLens.end(), std::size_t{1}, std::multiplies<>()); - const std::size_t TR = divider; - - std::size_t hw = - std::max(std::size_t{1}, static_cast(std::thread::hardware_concurrency())); - const std::size_t Te = std::min(hw * 4ul, std::max(1, INV)); - const std::size_t chunk = (INV + Te - 1) / Te; - - auto worker = [&](int t) { - const std::size_t row0 = std::size_t(t) * chunk; - const std::size_t row1 = std::min(row0 + chunk, INV); - - for(std::size_t r = row0; r < row1; ++r) - { - // decode invariant multi-index; compute base offsets - std::size_t tmp = r; - std::size_t base_in_off = 0; - std::size_t base_out_off = 0; - for(std::size_t k = 0; k < invLens.size(); ++k) - { - const std::size_t idx = (tmp / invRad[k]) % invLens[k]; - base_in_off += idx * invStrides_v[k]; - base_out_off += idx * outStrides[invariantDims[k]]; - } - - Reducer R(nanOpt, reduceOp, zeroV, withIdx); - - // iterate reduced subspace - for(std::size_t i = 0; i < TR; ++i) - { - std::size_t tmp2 = i; - std::size_t red_off = 0; - for(std::size_t k = 0; k < redLens.size(); ++k) - { - const std::size_t idx = (tmp2 / redRad[k]) % redLens[k]; - red_off += idx * redStrides_v[k]; - } - - auto v = convert_type(input[base_in_off + red_off]); - PreUnaryOp(v); - R.step(v, static_cast(i)); // flat index inside reduced subspace - } - - PosUnaryOp(R.acc); - if(alpha != 1.0f) - R.acc *= convert_type(alpha); - if(beta != 0.0f) - R.acc += - convert_type(output[base_out_off]) * convert_type(beta); - - res.data[base_out_off] = convert_type(R.acc); - if(withIdx) - res_indices.data[base_out_off] = R.idx; - } - }; - - if(parallel) - { - miopen::par_for(static_cast(Te), worker); - } - else - { - for(int te = 0; te < Te; ++te) - { - worker(te); - } - } - } - - return {res, res_indices}; -} - -template -std::tuple, tensor> 
-reduce_cpu_common(const miopen::ReduceTensorDescriptor& reduceDesc, - const tensor& input, - const tensor& output, - float alpha, - float beta, - bool parallel, - bool withIdx) -{ - auto inLengths = input.desc.GetLengths(); - auto outLengths = output.desc.GetLengths(); - auto inStrides = input.desc.GetStrides(); - auto outStrides = output.desc.GetStrides(); - - const auto reduceOp = reduceDesc.reduceTensorOp_; - const auto nanOpt = reduceDesc.reduceTensorNanOpt_; - - return reduce_cpu_common(reduceOp, - nanOpt, - inLengths, - outLengths, - input.data, - inStrides, - output.data, - outStrides, - alpha, - beta, - parallel, - withIdx); -} - -#endif +// Forwarding header — implementation moved to miopen_utils. +#include diff --git a/projects/miopen/test/fusionHost.hpp b/projects/miopen/test/fusionHost.hpp index 9693295959d7..11c6d54f6257 100644 --- a/projects/miopen/test/fusionHost.hpp +++ b/projects/miopen/test/fusionHost.hpp @@ -1,994 +1,3 @@ -/******************************************************************************* - * - * MIT License - * - * Copyright (c) 2018 Advanced Micro Devices, Inc. - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - * - *******************************************************************************/ -#pragma once -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include +// Forwarding header — implementation moved to miopen_utils. +#include #include "get_handle.hpp" -#include "tensor_holder.hpp" -#include "verify.hpp" - -template -void convHostForward(const tensor& input, - tensor& output, - const tensor& weights, - const int bias_mode, - const tensor& bias, - const miopenConvolutionDescriptor_t convDesc) -{ - - int in_n, in_c, in_h, in_w; - int in_nstride, in_cstride, in_hstride, in_wstride; - std::tie(in_n, in_c, in_h, in_w) = miopen::tien<4>(input.desc.GetLengths()); - std::tie(in_nstride, in_cstride, in_hstride, in_wstride) = - miopen::tien<4>(input.desc.GetStrides()); - - int wei_n, wei_c, wei_h, wei_w; - int wei_nstride, wei_cstride, wei_hstride, wei_wstride; - std::tie(wei_n, wei_c, wei_h, wei_w) = miopen::tien<4>(weights.desc.GetLengths()); - std::tie(wei_nstride, wei_cstride, wei_hstride, wei_wstride) = - miopen::tien<4>(weights.desc.GetStrides()); - - int out_n, out_c, out_h, out_w; - int out_nstride, out_cstride, out_hstride, out_wstride; - std::tie(out_n, out_c, out_h, out_w) = miopen::tien<4>(output.desc.GetLengths()); - std::tie(out_nstride, out_cstride, out_hstride, out_wstride) = - miopen::tien<4>(output.desc.GetStrides()); - - int stride_h, stride_w, pad_h, pad_w, dilation_h, dilation_w; - miopenConvolutionMode_t mode; - miopenPaddingMode_t pmode = miopen::deref(convDesc).paddingMode; - miopenGetConvolutionDescriptor( - convDesc, &mode, &pad_h, &pad_w, &stride_h, &stride_w, &dilation_h, &dilation_w); - - if(pmode == 
miopenPaddingSame) - { - pad_h = (in_h % stride_h == 0) ? (std::max((wei_h - stride_h), 0)) - : (std::max((wei_h - (in_h % stride_h)), 0)); - pad_w = (in_w % stride_w == 0) ? (std::max((wei_w - stride_w), 0)) - : (std::max((wei_w - (in_w % stride_w)), 0)); - pad_h /= 2; - pad_w /= 2; - } - else if(pmode == miopenPaddingValid) - { - pad_h = 0; - pad_w = 0; - } - - if(out_h <= 0 || out_w <= 0) - MIOPEN_THROW("Invalid Test Case: Check Output Dimension."); - - for(int o = 0; o < out_n; o++) - { // mini-batch size - for(int w = 0; w < out_c; w++) - { // out_channels (num filters) - for(int i = 0; i < out_h; i++) - { // output_height (from getforwardoutputdim()) - int in_off_h = i * stride_h; - for(int j = 0; j < out_w; j++) - { // output_width (from getforwardoutputdim()) - /*auto acc = static_cast(0.);*/ - auto acc = static_cast(0.); - int in_off_w = j * stride_w; - for(int k = 0; k < in_c; k++) - { // in_channels (RGB) - for(int x = 0; x < wei_h; x++) - { - int in_x = in_off_h - pad_h + x * dilation_h; - if(in_x >= 0 && in_x < in_h) - { - for(int y = 0; y < wei_w; y++) - { - int in_y = in_off_w - pad_w + y * dilation_w; - if(in_y >= 0 && in_y < in_w) - { - acc += double( - static_cast(input[o * in_nstride + k * in_cstride + - in_x * in_w + in_y]) * - static_cast(weights(w, k, x, y))); - } - } - } - } - } - acc = bias_mode != 0 ? 
acc + static_cast(bias[w]) : acc; - output[o * out_nstride + w * out_cstride + i * out_hstride + j] = - static_cast(acc); - } - } - } - } -} - -template -void batchNormSpatialHostInference(const tensor& input, - tensor& output, - const tensor& scale, - const tensor& bias, - double epsilon, - const tensor& estimatedMean, - const tensor& estimatedVariance, - bool useInverseVariance = false) -{ - - int n_batches, channels, height, width; - std::tie(n_batches, channels, height, width) = miopen::tien<4>(input.desc.GetLengths()); - miopen::par_for(channels, 1, [&](int cidx) { // via channel - V mean = estimatedMean(0, cidx, 0, 0); - V variance = estimatedVariance(0, cidx, 0, 0); - double invertVar = - useInverseVariance ? static_cast(variance) : 1.0 / sqrt(variance + epsilon); - // process the batch per channel - for(int row = 0; row < height; row++) - { // via rows - for(int column = 0; column < width; column++) - { // via columns - for(int bidx = 0; bidx < n_batches; bidx++) - { // via mini_batch - double elemStd = static_cast(input(bidx, cidx, row, column)) - mean; - double inhat = elemStd * invertVar; - output(bidx, cidx, row, column) = - static_cast(scale(0, cidx, 0, 0) * inhat + bias(0, cidx, 0, 0)); - // printf("output: %f\n",scale(0, cidx, 0, 0) * inhat + bias(0, cidx, 0, 0)); - } - } - } - }); -} - -template -void batchNormPerActivHostInference(const tensor& input, - tensor& output, - const tensor& scale, - const tensor& bias, - double epsilon, - const tensor& estimatedMean, - const tensor& estimatedVariance, - bool useInverseVariance = false) -{ - int n_batches, channels, height, width; - std::tie(n_batches, channels, height, width) = miopen::tien<4>(input.desc.GetLengths()); - miopen::par_for(channels, 1, [&](int cidx) { // via channel - for(int row = 0; row < height; row++) - { // via rows - for(int column = 0; column < width; column++) - { // via columns - // apply down the n_batch dimension - double mean = estimatedMean(0, cidx, row, column); - double 
variance = estimatedVariance(0, cidx, row, column); - double elemInvVar = useInverseVariance ? variance : 1.0 / sqrt(variance + epsilon); - for(int bidx = 0; bidx < n_batches; bidx++) - { // via mini_batch - // per (x-dims) channel load a block of data into LDS - double elemStd = input(bidx, cidx, row, column) - mean; - double inhat = elemStd * elemInvVar; - output(bidx, cidx, row, column) = - scale(0, cidx, row, column) * inhat + bias(0, cidx, row, column); - // printf("output: %f\n",output(bidx, cidx, row, column)); - } - } - } - }); -} - -template -void batchNormSpatialHostFwdTrain(const tensor& input, - tensor& out, - const tensor& scale, - const tensor& bias, - double epsilon, - double expAvgFactor, - tensor& saveMean, - tensor& saveInvVar, - tensor& runMean, - tensor& runVar) -{ - - int height, width, n_batch, channels; - std::tie(n_batch, channels, height, width) = miopen::tien<4>(input.desc.GetLengths()); - const auto nhw = double(height * width * n_batch); - - miopen::par_for(channels, 1, [&](int cidx) { - double elemStd = 0.; - double variance_accum = 0.; - double mean_accum = 0.; - double invVar = 0.; - double newRunMean = 0.; - double adjust = 0.; - - // process the batch per channel - for(int bidx = 0; bidx < n_batch; bidx++) - { // via mini_batch - for(int row = 0; row < height; row++) - { // via rows - for(int column = 0; column < width; column++) - { // via columns - // #1 calculate the mean - // iterating through the stack of images in the mini_batch - auto inval = static_cast(input(bidx, cidx, row, column)); - mean_accum += inval; - variance_accum += inval * inval; - } // end for (column) - } // end for (row) - } // end for (n) - - mean_accum /= nhw; - variance_accum /= nhw; - variance_accum += (-mean_accum * mean_accum); - invVar = 1.0 / sqrt(variance_accum + epsilon); - - // #4 apply the normalization - // x_hat = (x_i - mean) / sqrt(variance_accum + epsilon) - for(int bidx = 0; bidx < n_batch; bidx++) - { // via mini_batch - for(int row = 0; 
row < height; row++) - { // via rows - for(int column = 0; column < width; column++) - { // via columns - // #5 Gamma and Beta adjust - // y_i = gamma*x_hat + beta - elemStd = (static_cast(input(bidx, cidx, row, column)) - - mean_accum); // (x_i - mean) - out(bidx, cidx, row, column) = static_cast( - scale(0, cidx, 0, 0) * (invVar * elemStd) + bias(0, cidx, 0, 0)); - } // for (column) - } // for (row) - } // end for(n_batchs) - if(!saveMean.data.empty()) - { - saveMean(0, cidx, 0, 0) = mean_accum; - saveInvVar(0, cidx, 0, 0) = invVar; - } - if(!runMean.data.empty()) - { - newRunMean = runMean(0, cidx, 0, 0) * (1 - expAvgFactor); - runMean(0, cidx, 0, 0) = mean_accum * expAvgFactor + newRunMean; // newMean*factor + tmp - // var(n+1) = p * var(n-1) + (1 - p)*(b/b-1)*var(n) - adjust = (n_batch * height * width == 1) ? variance_accum - : (nhw / (nhw - 1)) * variance_accum; - runVar(0, cidx, 0, 0) = - (1 - expAvgFactor) * runVar(0, cidx, 0, 0) + expAvgFactor * adjust; - } - }); -} - -template -void batchNormSpatialHostBwdTrain(const tensor& x_input, - tensor& dy_input, - tensor& dx_out, - const tensor& bnScale, - const tensor& bnBias, - tensor& dscale, - tensor& dbias, - const tensor& savedMean, - const tensor& savedInvVar, - miopenActivationMode_t activ_mode, - double activ_beta, - double activ_alpha) -{ - double activ_gamma = 0.; - int height, width, n_batch, channels; - std::tie(n_batch, channels, height, width) = miopen::tien<4>(x_input.desc.GetLengths()); - auto nhw = double(height * width * n_batch); - int in_cstride = height * width; - - if(activ_mode > 0) - { - tensor input_norm = - tensor{x_input.desc.GetLayout_t(), x_input.desc.GetLengths()}; - miopen::par_for(channels, 1, [&](int cidx) { - double mean = 0.0; - double invVar = 0.0; - double elemStd = 0.; - double mean_accum = 0.0; - double variance_accum = 0.0; - if(!savedMean.data.empty()) - { - mean = static_cast(savedMean(0, cidx, 0, 0)); // HxW elements - invVar = static_cast(savedInvVar(0, cidx, 0, 0)); 
// HxW elements - } - else - { - for(int row = 0; row < height; row++) - { // via rows - for(int column = 0; column < width; column++) - { // via columns - for(int bidx = 0; bidx < n_batch; bidx++) - { // via mini_batch - auto inval = static_cast(x_input(bidx, cidx, row, column)); - mean_accum += inval; - variance_accum += inval * inval; - } - } - } - mean_accum /= nhw; - variance_accum /= nhw; - variance_accum += (-mean_accum * mean_accum); - mean = mean_accum; - invVar = 1.0 / sqrt(variance_accum); - } - for(int row = 0; row < height; row++) - { // via rows - for(int column = 0; column < width; column++) - { // via columns - for(int bidx = 0; bidx < n_batch; bidx++) - { // via mini_batch - elemStd = static_cast(x_input(bidx, cidx, row, column)) - - mean; // (x_i - mean) - input_norm(bidx, cidx, row, column) = static_cast( - bnScale(0, cidx, 0, 0) * (elemStd * invVar) + bnBias(0, cidx, 0, 0)); - } - } - } - }); - - activationHostBnormBwd(activ_mode, - activ_gamma, - activ_beta, - activ_alpha, - dy_input.data, - input_norm.data, - dy_input.data); - } - - miopen::par_for(channels, 1, [&](int cidx) { - double elemStd = 0.; - unsigned int xhat_index; - double mean = 0.0; - double invVar = 0.0; - double dyelem = 0.; - std::vector xhat(static_cast(n_batch) * in_cstride, 0.0); - // process the batch per channel - dscale(0, cidx, 0, 0) = 0.; - dbias(0, cidx, 0, 0) = 0.; - - if(!savedMean.data.empty()) - { - - mean = savedMean(0, cidx, 0, 0); // HxW elements - invVar = savedInvVar(0, cidx, 0, 0); // HxW elements - } - else - { - double variance_accum = 0.; - double mean_accum = 0.; - double inv_Var = 0.; - - // process the batch per channel - for(int bidx = 0; bidx < n_batch; bidx++) - { // via mini_batch - for(int row = 0; row < height; row++) - { // via rows - for(int column = 0; column < width; column++) - { // via columns - // #1 calculate the mean - // iterating through the stack of images in the mini_batch - auto inval = static_cast(x_input(bidx, cidx, row, column)); 
- mean_accum += inval; - variance_accum += inval * inval; - } // end for (column) - } // end for (row) - } // end for (n) - - mean_accum /= nhw; - variance_accum /= nhw; - variance_accum += (-mean_accum * mean_accum); - inv_Var = 1.0 / sqrt(variance_accum); - - mean = mean_accum; - invVar = inv_Var; - } - for(int row = 0; row < height; row++) - { // via rows - for(int column = 0; column < width; column++) - { // via columns - for(int bidx = 0; bidx < n_batch; bidx++) - { // via mini_batch - xhat_index = in_cstride * bidx + (width * row + column); - // per (x-dims) channel load a block of data into LDS - elemStd = static_cast(x_input(bidx, cidx, row, column)) - - mean; // (x_i - mean) - xhat[xhat_index] = elemStd * invVar; - dyelem = static_cast(dy_input(bidx, cidx, row, column)); - dbias(0, cidx, 0, 0) += dyelem; - dscale(0, cidx, 0, 0) += xhat[xhat_index] * dyelem; - } // end for(n_batch) - } // for (column) - } // for (row) - - for(int row = 0; row < height; row++) - { // via rows - for(int column = 0; column < width; column++) - { // via columns - for(int bidx = 0; bidx < n_batch; bidx++) - { // via mini_batch - xhat_index = in_cstride * bidx + (width * row + column); - - double tmp1 = nhw * dy_input(bidx, cidx, row, column) - dbias(0, cidx, 0, 0); - double tmp2 = -xhat[xhat_index] * dscale(0, cidx, 0, 0); - double tmp3 = (bnScale(0, cidx, 0, 0) * invVar) / nhw; - dx_out(bidx, cidx, row, column) = - static_cast(tmp3 * (tmp2 + tmp1)); - } // end for(n_batchs) - } // for (column) - } // for (row) - }); // for (channel) -} - -template -void batchNormActivSpatialHostBwdTrain(miopenActivationMode_t activMode, - double gamma, - double beta, - double alpha, - const tensor& x_input, - const tensor& dy_input, - const tensor& y_input, - tensor& dx_out, - const tensor& bnScale, - const tensor& bias, - tensor& dscale, - tensor& dbias, - const tensor& savedMean, - const tensor& savedInvVar) -{ - - int height, width, n_batch, channels; - std::tie(n_batch, channels, height, 
width) = miopen::tien<4>(x_input.desc.GetLengths()); - auto nhw = double(height * width * n_batch); - int in_cstride = height * width; - - miopen::par_for(channels, 1, [&](int cidx) { - double elemStd = 0.; - unsigned int xhat_index; - double mean = static_cast(savedMean(0, cidx, 0, 0)); // HxW elements - double invVar = static_cast(savedInvVar(0, cidx, 0, 0)); // HxW elements - double dyelem = 0.; - std::vector xhat(static_cast(n_batch) * in_cstride, 0.0); - // process the batch per channel - dscale(0, cidx, 0, 0) = 0.; - dbias(0, cidx, 0, 0) = 0.; - - for(int row = 0; row < height; row++) - { // via rows - for(int column = 0; column < width; column++) - { // via columns - for(int bidx = 0; bidx < n_batch; bidx++) - { // via mini_batch - - // recompute forward batch norm - xhat_index = in_cstride * bidx + (width * row + column); - // per (x-dims) channel load a block of data into LDS - elemStd = static_cast(x_input(bidx, cidx, row, column)) - - mean; // (x_i - mean) - xhat[xhat_index] = elemStd * invVar; - double bnrefowd = - bnScale(0, cidx, 0, 0) * xhat[xhat_index] + bias(0, cidx, 0, 0); - activationHostBwdElement(activMode, - gamma, - beta, - alpha, - dy_input(bidx, cidx, row, column), - bnrefowd, - y_input(bidx, cidx, row, column), - dyelem); - dbias(0, cidx, 0, 0) += dyelem; - dscale(0, cidx, 0, 0) += xhat[xhat_index] * dyelem; - } // end for(n_batch) - } // for (column) - } // for (row) - - for(int row = 0; row < height; row++) - { // via rows - for(int column = 0; column < width; column++) - { // via columns - for(int bidx = 0; bidx < n_batch; bidx++) - { // via mini_batch - xhat_index = in_cstride * bidx + (width * row + column); - double bnrefowd = - bnScale(0, cidx, 0, 0) * xhat[xhat_index] + bias(0, cidx, 0, 0); - activationHostBwdElement(activMode, - gamma, - beta, - alpha, - dy_input(bidx, cidx, row, column), - bnrefowd, - y_input(bidx, cidx, row, column), - dyelem); - // double tmp1 = nhw * dy_input(bidx, cidx, row, column) - dbias(0, cidx, 0, 0); - 
double tmp1 = nhw * dyelem - dbias(0, cidx, 0, 0); - double tmp2 = -xhat[xhat_index] * dscale(0, cidx, 0, 0); - double tmp3 = (bnScale(0, cidx, 0, 0) * invVar) / nhw; - dx_out(bidx, cidx, row, column) = static_cast(tmp3 * (tmp2 + tmp1)); - } // end for(n_batchs) - } // for (column) - } // for (row) - }); // for (channel) -} - -template -void batchNormPerActHostFwdTrain(const tensor& input, - tensor& out, - const tensor& scale, - const tensor& bias, - double epsilon, - double expAvgFactor, - tensor& saveMean, - tensor& saveInvVar, - tensor& runMean, - tensor& runVar) -{ - - int height, width, n_batch, channels; - std::tie(n_batch, channels, height, width) = miopen::tien<4>(input.desc.GetLengths()); - const auto n = double(n_batch); - - miopen::par_for(channels, 1, [&](int cidx) { - double mean_accum = 0.; - double variance_accum = 0.; - double elemStd = 0.; - double elemInvVar = 0.; - double inhat = 0.; - double newRunMean = 0.; - double adjust = 0.; - - // process the batch per channel - for(int row = 0; row < height; row++) - { // via rows - for(int column = 0; column < width; column++) - { // via columns - - mean_accum = 0.; - variance_accum = 0.; - for(int bidx = 0; bidx < n_batch; bidx++) - { // via mini_batch - // #1 calculate the mean :: iterating through the stack of images in the - // mini_batch - auto intval = static_cast(input(bidx, cidx, row, column)); - mean_accum += intval; - variance_accum += intval * intval; - } - mean_accum /= n; - variance_accum /= n; - variance_accum = variance_accum - (mean_accum * mean_accum); - elemInvVar = 1.0 / double(sqrt(variance_accum + epsilon)); - - // #4 apply the normalization :: x_hat = (x_i - mean) / sqrt(variance_accum - - // epsilon) - for(int bidx = 0; bidx < n_batch; bidx++) - { // via mini_batch - elemStd = (input(bidx, cidx, row, column) - mean_accum); // (x_i - mean) - inhat = elemStd * elemInvVar; - // #5 Gamma and Beta adjust :: y_i = gamma*x_hat + beta - out(bidx, cidx, row, column) = static_cast( - 
scale(0, cidx, row, column) * inhat + bias(0, cidx, row, column)); - } // end for(n_batch) - - if(!runMean.data.empty()) - { - newRunMean = runMean(0, cidx, row, column) * (1.0 - expAvgFactor); - runMean(0, cidx, row, column) = - mean_accum * expAvgFactor + newRunMean; // newMean*factor + tmp - } - // var(n+1) = p * var(n-1) + (1 - p)*(b/b-1)*var(n) - if(!runVar.data.empty()) - { - adjust = (n_batch == 1) ? variance_accum : (n / (n - 1.0)) * variance_accum; - runVar(0, cidx, row, column) = - (1 - expAvgFactor) * runVar(0, cidx, row, column) + expAvgFactor * adjust; - } - if(!saveMean.data.empty() || !saveInvVar.data.empty()) - { - saveMean(0, cidx, row, column) = static_cast(mean_accum); - saveInvVar(0, cidx, row, column) = static_cast(elemInvVar); - } - - } // for (column) - } // for (row) - }); -} - -template -void batchNormPerActHostBwdTrain(const tensor& x_input, - const tensor& dy_input, - tensor& dx_out, - const tensor& scale, - tensor& dscale, - tensor& dbias, - const tensor& savedMean, - const tensor& savedInvVar) -{ - - int height, width, n_batch, channels; - std::tie(n_batch, channels, height, width) = miopen::tien<4>(x_input.desc.GetLengths()); - int in_cstride = height * width; - auto n = double(n_batch); - - miopen::par_for(channels, 1, [&](int cidx) { - double elemStd = 0.; - unsigned int xhat_index; - double mean = 0.; - double elemInvVar = 0.; - double dyelem = 0.; - double dxhat = 0.; - double dxhathat = 0.; - double tmp1 = 0.; - std::vector xhat(static_cast(n_batch) * in_cstride); - - // process the batch per channel - for(int row = 0; row < height; row++) - { // via rows - for(int column = 0; column < width; column++) - { // via columns - dxhat = 0.; - dxhathat = 0.; - - if(!savedMean.data.empty()) - { - mean = savedMean(0, cidx, row, column); // HxW elements - elemInvVar = savedInvVar(0, cidx, row, column); // HxW elements - } - else - { - double variance_accum = 0.; - double mean_accum = 0.; - - // process the batch per channel - for(int bidx = 
0; bidx < n_batch; bidx++) - { // via mini_batch - auto inval = static_cast(x_input(bidx, cidx, row, column)); - mean_accum += inval; - variance_accum += inval * inval; - } // end for (n) - - mean_accum /= n; - variance_accum /= n; - variance_accum += (-mean_accum * mean_accum); - - mean = mean_accum; - elemInvVar = 1.0 / sqrt(variance_accum); - } - - for(int bidx = 0; bidx < n_batch; bidx++) - { // via mini_batch - xhat_index = in_cstride * bidx + (width * row + column); - // per (x-dims) channel load a block of data into LDS - elemStd = static_cast(x_input(bidx, cidx, row, column)) - - mean; // (x_i - mean) - xhat[xhat_index] = elemStd * elemInvVar; - dyelem = static_cast(dy_input(bidx, cidx, row, column)); - dbias(0, cidx, row, column) += dyelem; - dscale(0, cidx, row, column) += xhat[xhat_index] * dyelem; - tmp1 = scale(0, cidx, row, column) * dyelem; - dxhat += tmp1; - dxhathat += tmp1 * xhat[xhat_index]; - - } // end for(n_batchs) - - for(int bidx = 0; bidx < n_batch; bidx++) - { // via mini_batch - xhat_index = in_cstride * bidx + (width * row + column); - tmp1 = xhat[xhat_index] * dxhathat + dxhat; - double tmp2 = - n_batch * scale(0, cidx, row, column) * dy_input(bidx, cidx, row, column) - - tmp1; - double tmp3 = elemInvVar / (double(n)); - dx_out(bidx, cidx, row, column) = static_cast(tmp3 * tmp2); - } // end for(n_batchs) - } // for (column) - } // for (row) - }); -} - -template -void batchNormActivPerActHostBwdTrain(miopenActivationMode_t activMode, - double gamma, - double beta, - double alpha, - const tensor& x_input, - const tensor& dy_input, - const tensor& y_input, - tensor& dx_out, - const tensor& scale, - const tensor& bias, - tensor& dscale, - tensor& dbias, - const tensor& savedMean, - const tensor& savedInvVar) -{ - - int height, width, n_batch, channels; - std::tie(n_batch, channels, height, width) = miopen::tien<4>(x_input.desc.GetLengths()); - int in_cstride = height * width; - auto n = double(n_batch); - - miopen::par_for(channels, 1, 
[&](int cidx) { - double elemStd = 0.; - unsigned int xhat_index; - double mean = 0.; - double elemInvVar = 0.; - double dyelem = 0.; - double dxhat = 0.; - double dxhathat = 0.; - double tmp1 = 0.; - std::vector xhat(static_cast(n_batch) * in_cstride); - - // process the batch per channel - for(int row = 0; row < height; row++) - { // via rows - for(int column = 0; column < width; column++) - { // via columns - dxhat = 0.; - dxhathat = 0.; - - mean = savedMean(0, cidx, row, column); // HxW elements - elemInvVar = savedInvVar(0, cidx, row, column); // HxW elements - - for(int bidx = 0; bidx < n_batch; bidx++) - { // via mini_batch - xhat_index = in_cstride * bidx + (width * row + column); - // per (x-dims) channel load a block of data into LDS - elemStd = static_cast(x_input(bidx, cidx, row, column)) - - mean; // (x_i - mean) - xhat[xhat_index] = elemStd * elemInvVar; - double bnrefowd = - scale(0, cidx, row, column) * xhat[xhat_index] + bias(0, cidx, row, column); - activationHostBwdElement(activMode, - gamma, - beta, - alpha, - dy_input(bidx, cidx, row, column), - bnrefowd, - y_input(bidx, cidx, row, column), - dyelem); - /*dyelem = static_cast(dy_input(bidx, cidx, row, column));*/ - dbias(0, cidx, row, column) += dyelem; - dscale(0, cidx, row, column) += xhat[xhat_index] * dyelem; - tmp1 = scale(0, cidx, row, column) * dyelem; - dxhat += tmp1; - dxhathat += tmp1 * xhat[xhat_index]; - - } // end for(n_batchs) - - for(int bidx = 0; bidx < n_batch; bidx++) - { // via mini_batch - xhat_index = in_cstride * bidx + (width * row + column); - tmp1 = xhat[xhat_index] * dxhathat + dxhat; - double bnrefowd = - scale(0, cidx, row, column) * xhat[xhat_index] + bias(0, cidx, row, column); - activationHostBwdElement(activMode, - gamma, - beta, - alpha, - dy_input(bidx, cidx, row, column), - bnrefowd, - y_input(bidx, cidx, row, column), - dyelem); - double tmp2 = (n_batch * scale(0, cidx, row, column) * dyelem) - tmp1; - double tmp3 = elemInvVar / (double(n)); - dx_out(bidx, 
cidx, row, column) = static_cast(tmp3 * tmp2); - } // end for(n_batchs) - } // for (column) - } // for (row) - }); -} - -template -void visitActivationHostInfer( - miopenActivationMode_t activMode, double gamma, double beta, double alpha, F f) -{ - switch(activMode) - { - case miopenActivationPASTHRU: // x - f([=](double x) { return x; }); - break; - case miopenActivationLOGISTIC: // 1 / (1 + e^-x) //Sigmoid - f([=](double x) { return (1. / (1. + std::exp(-x))); }); - break; - case miopenActivationTANH: // beta * tanh(alpha * x) - f([=](double x) { return (beta * std::tanh(alpha * x)); }); - break; - case miopenActivationRELU: // max(0, x) - f([=](double x) { return ((x > 0.) ? x : 0.); }); - break; - case miopenActivationSOFTRELU: // log(1 + e^x) // bonomial normal log likelihood - f([=](double x) { - return (x > 0.) ? (x + std::log1p(std::exp(-x))) : (std::log1p(std::exp(x))); - }); - break; - case miopenActivationABS: // abs(x) - f([=](double x) { return (std::fabs(x)); }); - break; - case miopenActivationPOWER: // (alpha + beta * x) ^ gamma - f([=](double x) { - auto v = (alpha + beta * x); - return (v <= std::numeric_limits::epsilon()) ? 0. : pow(v, gamma); - }); - break; - case miopenActivationCLIPPEDRELU: // min(alpha, max(0, x)) - f([=](double x) { return (std::min(alpha, std::max(double(0.), x))); }); - break; - case miopenActivationLEAKYRELU: // alpha * x | x<=0; x | x>0 - f([=](double x) { return ((x > 0.) ? x : x * alpha); }); - break; - case miopenActivationELU: // alpha * (exp(x)-1) | x<=0; x | x>0 - f([=](double x) { return ((x > 0.) ? 
x : alpha * std::expm1(x)); }); - break; - case miopenActivationCLAMP: // max(alpha, min(beta, x)) - f([=](double x) { return (std::max(alpha, std::min(beta, x))); }); - break; - // default: printf("ERROR: unknown neuron type: %d\n", activMode); break; - } -} - -template -inline void activationHostInfer(miopenActivationMode_t activMode, - double gamma, - double beta, - double alpha, - const std::vector input, - std::vector& output) -{ - visitActivationHostInfer(activMode, gamma, beta, alpha, [&](auto f) { - miopen::par_for(input.size(), 1, [&](int index) { - output[index] = static_cast(f(static_cast(input[index]))); - }); - }); -} - -template -void visitActivationHostBwd( - miopenActivationMode_t activMode, double gamma, double beta, double alpha, F f) -{ - switch(activMode) - { - case miopenActivationPASTHRU: // x - f([=](double dy, double, double) { return dy; }); - break; - case miopenActivationLOGISTIC: // 1 / (1 + e^-x) //Sigmoid - f([=](double dy, double, double y) { return dy * y * (1 - y); }); - break; - case miopenActivationTANH: // beta * tanh(alpha * x) - f([=](double dy, double, double y) { return dy * alpha * (beta - y * y / beta); }); - break; - case miopenActivationRELU: // max(0, x) - f([=](double dy, double x, double) { return (x > 0) ? dy : 0; }); - break; - case miopenActivationSOFTRELU: // log(1 + e^x) // bonomial normal log likelihood - f([=](double dy, double x, double) { - static const double threshold = 50.; - double expval = std::exp(std::min(x, threshold)); - return dy * expval / (expval + 1.0); - }); - break; - case miopenActivationABS: // abs(x) - f([=](double dy, double x, double) { return dy * ((x > 0) ? 1 : -1); }); - break; - case miopenActivationPOWER: // (alpha + beta * x) ^ gamma - f([=](double, double x, double y) { - auto v = alpha + beta * x; - return v <= std::numeric_limits::epsilon() ? 
0 : gamma * beta * y / v; - }); - break; - case miopenActivationCLIPPEDRELU: // min(alpha, max(0, x)) - f([=](double dy, double x, double) { return (x > 0 && x <= alpha) ? dy : 0; }); - break; - case miopenActivationLEAKYRELU: // alpha * x | x<=0; x | x>0 - f([=](double dy, double x, double) { return dy * ((x > 0) ? 1 : alpha); }); - break; - case miopenActivationELU: // alpah * (exp(x)-1) | x<=0; x | x>0 - f([=](double dy, double x, double y) { return dy * ((x > 0) ? 1 : y + alpha); }); - break; - case miopenActivationCLAMP: // max(alpha, min(beta, x)) - f([=](double dy, double x, double) { return (x > alpha && x <= beta) ? dy : 0; }); - break; - // default: printf("ERROR: unknown neuron type: %d\n", activMode); break; - } -} - -template -inline void activationHostBnormBwd(miopenActivationMode_t activMode, - double gamma, - double beta, - double alpha, - const std::vector dyinput, - const std::vector xinput, - std::vector& output) -{ - double dummy; - visitActivationHostBwd(activMode, gamma, beta, alpha, [&](auto f) { - miopen::par_for(dyinput.size(), 1, [&](int index) { - output[index] = static_cast( - f(static_cast(dyinput[index]), static_cast(xinput[index]), dummy)); - }); - }); -} - -template -inline void activationHostBwd(miopenActivationMode_t activMode, - double gamma, - double beta, - double alpha, - const std::vector dyinput, - const std::vector xinput, - const std::vector yinput, - std::vector& output) -{ - visitActivationHostBwd(activMode, gamma, beta, alpha, [&](auto f) { - miopen::par_for(dyinput.size(), 1, [&](int index) { - output[index] = static_cast(f(static_cast(dyinput[index]), - static_cast(xinput[index]), - static_cast(yinput[index]))); - }); - }); -} - -inline void activationHostBwdElement(miopenActivationMode_t activMode, - double gamma, - double beta, - double alpha, - const double dyinput, - const double xinput, - const double yinput, - double& output) -{ - visitActivationHostBwd(activMode, gamma, beta, alpha, [&](auto f) { - output = 
static_cast(f(dyinput, xinput, yinput)); - }); -} - -template -tensor get_output_tensor(const miopen::ConvolutionDescriptor& filter, - const tensor& input, - const tensor& weights) -{ - return tensor{filter.GetForwardOutputTensor(input.desc, weights.desc, miopen_type{})}; -} diff --git a/projects/miopen/test/gemm.hpp b/projects/miopen/test/gemm.hpp index 81c38db0fdf3..be0195545352 100644 --- a/projects/miopen/test/gemm.hpp +++ b/projects/miopen/test/gemm.hpp @@ -1,120 +1,2 @@ -/******************************************************************************* - * - * MIT License - * - * Copyright (c) 2017 Advanced Micro Devices, Inc. - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- * - *******************************************************************************/ -#ifndef GUARD_GEMM_HPP -#define GUARD_GEMM_HPP - -#include -#include -#include - -/* - A and B rows and cols should be passed as default values (NxM, MxK), independently of - a_transponse/b_transpose flag value - C rows and cols should have correct values based on a_transponse/b_transpose values - A, B, C strides should have corret values based on a_transponse/b_transpose values -*/ -template -void gemm_cpu(const Dtype* a_ptr, - const size_t a_cols, - const size_t a_rows, - const size_t a_stride, - const bool a_transpose, - const Dtype* b_ptr, - const size_t b_cols, - const size_t b_rows, - const size_t b_stride, - const bool b_transpose, - Dtype* c_ptr, - const size_t c_cols, - const size_t c_rows, - const size_t c_stride, - double alpha = 1.0, - double beta = 1.0) -{ - if((!a_transpose && !b_transpose && - ((a_cols != b_rows) || (a_rows != c_rows) || (b_cols != c_cols))) || - (a_transpose && b_transpose && - ((a_rows != b_cols) || (a_cols != c_rows) || (b_rows != c_cols))) || - (a_transpose && !b_transpose && - ((a_rows != b_rows) || (a_cols != c_rows) || (b_cols != c_cols))) || - (!a_transpose && b_transpose && - ((a_cols != b_cols) || (a_rows != c_rows) || (b_rows != c_cols)))) - { - MIOPEN_THROW("MM_CPU_ERROR. Incompatible matrix size:\nA: " + std::to_string(a_rows) + "x" + - std::to_string(a_cols) + " transpose: " + (a_transpose ? "true" : "false") + - "\nB: " + std::to_string(b_rows) + "x" + std::to_string(b_cols) + - " transpose: " + (b_transpose ? "true" : "false") + - "\nC: " + std::to_string(c_rows) + "x" + std::to_string(c_cols) + "\n"); - } - - size_t inner_loop_limit = a_transpose ? 
a_rows : a_cols; - auto inner_loop = [&](int m, int n) { - double el = 0.0; - if(!a_transpose && !b_transpose) - { - miopen::ford(inner_loop_limit)([&](int k) { - el += static_cast(a_ptr[m * a_stride + k]) * - static_cast(b_ptr[k * b_stride + n]); - }); - } - else if(!a_transpose && b_transpose) - { - miopen::ford(inner_loop_limit)([&](int k) { - el += static_cast(a_ptr[m * a_stride + k]) * - static_cast(b_ptr[n * b_stride + k]); - }); - } - else if(a_transpose && !b_transpose) - { - miopen::ford(inner_loop_limit)([&](int k) { - el += static_cast(a_ptr[k * a_stride + m]) * - static_cast(b_ptr[k * b_stride + n]); - }); - } - else - { - miopen::ford(inner_loop_limit)([&](int k) { - el += static_cast(a_ptr[k * a_stride + m]) * - static_cast(b_ptr[n * b_stride + k]); - }); - } - - c_ptr[m * c_stride + n] = - static_cast(beta * static_cast(c_ptr[m * c_stride + n]) + alpha * el); - }; - - constexpr size_t iter_margin = 1'048'576; // 2^20 - if(c_rows * c_cols * inner_loop_limit > iter_margin) - { - miopen::par_ford(c_rows, c_cols)(inner_loop); - } - else - { - miopen::ford(c_rows, c_cols)(inner_loop); - } -} - -#endif +// Forwarding header — implementation moved to miopen_utils. +#include diff --git a/projects/miopen/test/gtest/CMakeLists.txt b/projects/miopen/test/gtest/CMakeLists.txt index af74113fa312..dfdb6ef4630e 100644 --- a/projects/miopen/test/gtest/CMakeLists.txt +++ b/projects/miopen/test/gtest/CMakeLists.txt @@ -81,7 +81,7 @@ function(add_gtest TEST_NAME TEST_CPP) # Workaround : change in rocm-cmake was causing linking error so had to add ${CMAKE_DL_LIBS} # We can remove ${CMAKE_DL_LIBS} once root cause is identified. 
# MIOpen_with_plugins ensures CK plugin .so's are built alongside the test - target_link_libraries(${TEST_NAME} ${CMAKE_DL_LIBS} GTest::gtest GTest::gtest_main GTest::gmock MIOpen_with_plugins hip::host ) + target_link_libraries(${TEST_NAME} ${CMAKE_DL_LIBS} GTest::gtest GTest::gtest_main GTest::gmock MIOpen_with_plugins hip::host miopen_common_utils miopen_utils) if(NOT MIOPEN_EMBED_DB STREQUAL "") target_link_libraries(${TEST_NAME} $) endif() @@ -211,7 +211,7 @@ endforeach() # Otherwise, all files in ${SOURCES} are rebuilt for each test. add_library(miopen_gtest_common STATIC ${SOURCES}) target_include_directories(miopen_gtest_common PRIVATE ../ ../../src/kernels) -target_link_libraries(miopen_gtest_common PRIVATE GTest::gtest GTest::gmock MIOpen) +target_link_libraries(miopen_gtest_common PRIVATE GTest::gtest GTest::gmock MIOpen miopen_common_utils miopen_utils) if(WIN32) # Refer to https://en.cppreference.com/w/cpp/language/types for details. target_compile_options(miopen_gtest_common PRIVATE $:-U__LP64__>>) diff --git a/projects/miopen/test/gtest/adam.hpp b/projects/miopen/test/gtest/adam.hpp index 0efd9b390765..e54ddd1fc85d 100644 --- a/projects/miopen/test/gtest/adam.hpp +++ b/projects/miopen/test/gtest/adam.hpp @@ -24,7 +24,6 @@ * *******************************************************************************/ #define MIOPEN_BETA_API 1 -#include "../driver/tensor_driver.hpp" #include "cpu_adam.hpp" #include "get_handle.hpp" #include "random.hpp" diff --git a/projects/miopen/test/gtest/addlayernorm.hpp b/projects/miopen/test/gtest/addlayernorm.hpp index 0eba1588058d..511882710ff8 100644 --- a/projects/miopen/test/gtest/addlayernorm.hpp +++ b/projects/miopen/test/gtest/addlayernorm.hpp @@ -24,7 +24,6 @@ * *******************************************************************************/ -#include "../driver/tensor_driver.hpp" #include "get_handle.hpp" #include "random.hpp" #include "tensor_holder.hpp" diff --git a/projects/miopen/test/gtest/cat.hpp 
b/projects/miopen/test/gtest/cat.hpp index 8d5fb109e0ea..bf29ccc7bcb0 100644 --- a/projects/miopen/test/gtest/cat.hpp +++ b/projects/miopen/test/gtest/cat.hpp @@ -2,7 +2,6 @@ // SPDX-License-Identifier: MIT #define MIOPEN_BETA_API 1 -#include "../driver/tensor_driver.hpp" #include "cpu_cat.hpp" #include "get_handle.hpp" #include "random.hpp" diff --git a/projects/miopen/test/gtest/conv3d_test_case.hpp b/projects/miopen/test/gtest/conv3d_test_case.hpp index a10c1809cacf..d9a061941703 100644 --- a/projects/miopen/test/gtest/conv3d_test_case.hpp +++ b/projects/miopen/test/gtest/conv3d_test_case.hpp @@ -30,7 +30,6 @@ #include "get_handle.hpp" #include -#include "../driver/tensor_driver.hpp" #include "conv_common.hpp" #include "conv_test_base.hpp" diff --git a/projects/miopen/test/gtest/find_mode_trust_verify.cpp b/projects/miopen/test/gtest/find_mode_trust_verify.cpp index 021a593f3372..178b1edff149 100644 --- a/projects/miopen/test/gtest/find_mode_trust_verify.cpp +++ b/projects/miopen/test/gtest/find_mode_trust_verify.cpp @@ -26,7 +26,7 @@ #include #include -#include "../../driver/driver.hpp" +#include namespace miopen { std::vector diff --git a/projects/miopen/test/gtest/getitem.hpp b/projects/miopen/test/gtest/getitem.hpp index 22c98ca67b99..8889b1d3d457 100644 --- a/projects/miopen/test/gtest/getitem.hpp +++ b/projects/miopen/test/gtest/getitem.hpp @@ -24,7 +24,6 @@ * *******************************************************************************/ -#include "../driver/tensor_driver.hpp" #include "get_handle.hpp" #include "random.hpp" #include "tensor_holder.hpp" diff --git a/projects/miopen/test/gtest/group_conv.hpp b/projects/miopen/test/gtest/group_conv.hpp index d9ab9e080898..8acdd56548e2 100644 --- a/projects/miopen/test/gtest/group_conv.hpp +++ b/projects/miopen/test/gtest/group_conv.hpp @@ -32,7 +32,6 @@ #include #include -#include "../driver/tensor_driver.hpp" #include "conv_common.hpp" #include "gtest_common.hpp" diff --git 
a/projects/miopen/test/gtest/group_conv_deterministic_split_k.cpp b/projects/miopen/test/gtest/group_conv_deterministic_split_k.cpp index 3e141b72057e..7f9c62901733 100644 --- a/projects/miopen/test/gtest/group_conv_deterministic_split_k.cpp +++ b/projects/miopen/test/gtest/group_conv_deterministic_split_k.cpp @@ -30,7 +30,6 @@ #include #include "../random.hpp" #include "get_handle.hpp" -#include "../driver/tensor_driver.hpp" #include "conv_common.hpp" #include "gtest_common.hpp" diff --git a/projects/miopen/test/gtest/groupnorm.hpp b/projects/miopen/test/gtest/groupnorm.hpp index 33c4ed105f59..e28c5b652605 100644 --- a/projects/miopen/test/gtest/groupnorm.hpp +++ b/projects/miopen/test/gtest/groupnorm.hpp @@ -31,7 +31,6 @@ #include "cpu_groupnorm.hpp" #include "get_handle.hpp" #include "random.hpp" -#include "../driver/tensor_driver.hpp" #include "verify.hpp" #include diff --git a/projects/miopen/test/gtest/kernel_tuning_net.cpp b/projects/miopen/test/gtest/kernel_tuning_net.cpp index 304adb9800d4..760a099b2ef4 100644 --- a/projects/miopen/test/gtest/kernel_tuning_net.cpp +++ b/projects/miopen/test/gtest/kernel_tuning_net.cpp @@ -30,7 +30,7 @@ #include #include #include -#include "../../driver/driver.hpp" +#include struct KernelTuningNetTestCase : AIModelTestCase { diff --git a/projects/miopen/test/gtest/kthvalue.hpp b/projects/miopen/test/gtest/kthvalue.hpp index 2aa7e6fd41d1..58d7db388419 100644 --- a/projects/miopen/test/gtest/kthvalue.hpp +++ b/projects/miopen/test/gtest/kthvalue.hpp @@ -23,7 +23,6 @@ * SOFTWARE. 
* *******************************************************************************/ -#include "../driver/tensor_driver.hpp" #include "cpu_kthvalue.hpp" #include "get_handle.hpp" diff --git a/projects/miopen/test/gtest/layout_transpose.cpp b/projects/miopen/test/gtest/layout_transpose.cpp index f67c7a0387de..b688d17b2aa7 100644 --- a/projects/miopen/test/gtest/layout_transpose.cpp +++ b/projects/miopen/test/gtest/layout_transpose.cpp @@ -25,7 +25,6 @@ *******************************************************************************/ #include -#include "../../driver/conv_common.hpp" #include #include #include @@ -38,6 +37,8 @@ #include +using float16 = half_float::half; + namespace { template diff --git a/projects/miopen/test/gtest/reducecalculation.hpp b/projects/miopen/test/gtest/reducecalculation.hpp index 2f2867423d5f..3b2de8465c0c 100644 --- a/projects/miopen/test/gtest/reducecalculation.hpp +++ b/projects/miopen/test/gtest/reducecalculation.hpp @@ -24,14 +24,13 @@ * *******************************************************************************/ -#include "../driver/tensor_driver.hpp" +#include #include "../src/kernels/MIOpenReduceCalculation.hpp" #include "get_handle.hpp" #include "random.hpp" #include "tensor_holder.hpp" #include "verify.hpp" #include -#include #include template diff --git a/projects/miopen/test/gtest/reduceextreme.hpp b/projects/miopen/test/gtest/reduceextreme.hpp index f884bb8fc5cf..0c2cde8c7564 100644 --- a/projects/miopen/test/gtest/reduceextreme.hpp +++ b/projects/miopen/test/gtest/reduceextreme.hpp @@ -24,7 +24,7 @@ * *******************************************************************************/ -#include "../driver/tensor_driver.hpp" +#include #include "../src/kernels/MIOpenReduceExtreme.hpp" #include "get_handle.hpp" #include "random.hpp" @@ -32,7 +32,6 @@ #include "verify.hpp" #include #include -#include template bool compare_equal(T r1, T r2) diff --git a/projects/miopen/test/gtest/rope.hpp b/projects/miopen/test/gtest/rope.hpp index 
8c8dd2ed2b3d..109ff0549978 100644 --- a/projects/miopen/test/gtest/rope.hpp +++ b/projects/miopen/test/gtest/rope.hpp @@ -24,7 +24,6 @@ * *******************************************************************************/ -#include "../driver/tensor_driver.hpp" #include "get_handle.hpp" #include "random.hpp" #include "tensor_holder.hpp" diff --git a/projects/miopen/test/gtest/softmax_find20.cpp b/projects/miopen/test/gtest/softmax_find20.cpp index 094a432d4521..84dd758f4d30 100644 --- a/projects/miopen/test/gtest/softmax_find20.cpp +++ b/projects/miopen/test/gtest/softmax_find20.cpp @@ -28,7 +28,7 @@ #include "test.hpp" #include "get_handle.hpp" #include "tensor_holder.hpp" -#include "../driver/mloSoftmaxHost.hpp" +#include #include "verify.hpp" #include diff --git a/projects/miopen/test/gtest/t5layernorm.hpp b/projects/miopen/test/gtest/t5layernorm.hpp index 1ee2f2bd6ebe..e71819273683 100644 --- a/projects/miopen/test/gtest/t5layernorm.hpp +++ b/projects/miopen/test/gtest/t5layernorm.hpp @@ -24,7 +24,6 @@ * *******************************************************************************/ -#include "../driver/tensor_driver.hpp" #include "get_handle.hpp" #include "random.hpp" #include "tensor_holder.hpp" diff --git a/projects/miopen/test/gtest/transformers_adam_w.hpp b/projects/miopen/test/gtest/transformers_adam_w.hpp index d2a804841258..ef465fc98854 100644 --- a/projects/miopen/test/gtest/transformers_adam_w.hpp +++ b/projects/miopen/test/gtest/transformers_adam_w.hpp @@ -24,7 +24,6 @@ * *******************************************************************************/ #define MIOPEN_BETA_API 1 -#include "../driver/tensor_driver.hpp" #include "cpu_transformers_adam_w.hpp" #include "get_handle.hpp" #include "random.hpp" diff --git a/projects/miopen/test/network_data.hpp b/projects/miopen/test/network_data.hpp index 987d4dda9929..18e85973ef3f 100644 --- a/projects/miopen/test/network_data.hpp +++ b/projects/miopen/test/network_data.hpp @@ -1,438 +1,2 @@ 
-/******************************************************************************* - * - * MIT License - * - * Copyright (c) 2017 Advanced Micro Devices, Inc. - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - * - *******************************************************************************/ - -#ifndef GUARD_MIOPEN_TEST_NETWORK_DATA_HPP -#define GUARD_MIOPEN_TEST_NETWORK_DATA_HPP - -#include -#include -#include -#include - -#ifndef MIOPEN_TEST_DEFAULT_BATCH_SIZE_FACTOR -#define MIOPEN_TEST_DEFAULT_BATCH_SIZE_FACTOR 0 -#endif - -template -inline constexpr T pick_batch_size(T x, T y) -{ - return (y == 0 || y > x) ? 
1 : x / y; -} - -// Reduce tests execution time -#define MIOPEN_TESTS_GET_INPUTS_ENABLE_HUGE_TENSORS 1 - -template -inline std::set> get_inputs(T n = MIOPEN_TEST_DEFAULT_BATCH_SIZE_FACTOR) -{ - // clang-format off - return - { - { pick_batch_size(32, n), 1, 14, 14 }, - { pick_batch_size(100, n), 1, 8, 8 }, - { pick_batch_size(256, n), 1, 27, 27 }, -#if MIOPEN_TESTS_GET_INPUTS_ENABLE_HUGE_TENSORS - { pick_batch_size(64, n), 19, 1024,2048}, -#endif - { pick_batch_size(100, n), 3, 32, 32 }, - { pick_batch_size(100, n), 32, 16, 16 }, - { pick_batch_size(100, n), 32, 8, 8 }, - { pick_batch_size(128, n), 256, 12, 12 }, - { pick_batch_size(128, n), 3, 231, 231 }, - { pick_batch_size(128, n), 512, 12, 12 }, - { pick_batch_size(256, n), 256, 13, 13 }, - { pick_batch_size(256, n), 3, 227, 227 }, - { pick_batch_size(256, n), 384, 13, 13 }, - { pick_batch_size(256, n), 96, 27, 27 }, - { pick_batch_size(32, n), 128, 28, 28 }, - { pick_batch_size(32, n), 144, 14, 14 }, - { pick_batch_size(32, n), 192, 28, 28 }, - { pick_batch_size(32, n), 192, 7, 7 }, - { pick_batch_size(32, n), 256, 28, 28 }, - { pick_batch_size(32, n), 3, 224, 224 }, - { pick_batch_size(32, n), 32, 28, 28 }, - { pick_batch_size(32, n), 48, 7, 7 }, - { pick_batch_size(32, n), 480, 128, 256 }, - { pick_batch_size(32, n), 480, 64, 128 }, - { pick_batch_size(32, n), 512, 4, 4 }, - { pick_batch_size(32, n), 512, 64, 128 }, - { pick_batch_size(16, n), 64, 56, 56 }, - { pick_batch_size(32, n), 832, 7, 7 }, - { pick_batch_size(64, n), 128, 56, 56 }, - { pick_batch_size(64, n), 256, 28, 28 }, - { pick_batch_size(64, n), 3, 224, 224 }, - { pick_batch_size(64, n), 512, 28, 28 }, - { pick_batch_size(64, n), 64, 112, 112 }, - { pick_batch_size(32, n), 64, 14, 14 }, - { pick_batch_size(32, n), 192, 14, 14 }, - { pick_batch_size(32, n), 320, 28, 28 }, - { pick_batch_size(32, n), 576, 14, 14 }, - { pick_batch_size(32, n), 576, 4, 4 }, - { pick_batch_size(32, n), 1056, 7, 7 }, - { pick_batch_size(32, n), 2048, 11, 11 }, -#if 
MIOPEN_TESTS_GET_INPUTS_ENABLE_HUGE_TENSORS - { pick_batch_size(32, n), 16, 2048, 2048 }, - { pick_batch_size(32, n), 16, 3072, 3072 }, - { pick_batch_size(32, n), 16, 4096, 4096 }, -#endif - { 1, 1, 1, 1 } - }; - // clang-format on -} - -template -inline std::set> get_weights(T n = MIOPEN_TEST_DEFAULT_BATCH_SIZE_FACTOR) -{ - // clang-format off - return - { - { pick_batch_size(1024, n),1024, 3, 3 }, - { pick_batch_size(1024, n),512, 3, 3 }, - { pick_batch_size(128, n), 256, 1, 1 }, - { pick_batch_size(128, n), 528, 1, 1 }, - { pick_batch_size(128, n), 96, 3, 3 }, - { pick_batch_size(16, n), 192, 1, 1 }, - { pick_batch_size(224, n), 112, 3, 3 }, - { pick_batch_size(256, n), 96, 5, 5 }, - { pick_batch_size(288, n), 144, 3, 3 }, - { pick_batch_size(48, n), 832, 1, 1 }, - { pick_batch_size(512, n), 256, 3, 3 }, - { pick_batch_size(64, n), 1, 2, 2 }, - { pick_batch_size(64, n), 3, 3, 3 }, - { pick_batch_size(64, n), 3, 7, 7 }, - { pick_batch_size(64, n), 32, 5, 5 }, - { pick_batch_size(64, n), 480, 1, 1 }, - { pick_batch_size(64, n), 64, 1, 1 }, - { pick_batch_size(96, n), 3, 11, 11 }, - { pick_batch_size(192, n), 64, 5, 5 }, - { pick_batch_size(64, n), 64, 3, 3 }, - { pick_batch_size(224, n), 224, 3, 3 }, - { pick_batch_size(224, n), 192, 3, 3 }, - { pick_batch_size(128, n), 320, 1, 1 }, - { pick_batch_size(192, n), 576, 1, 1 }, - { pick_batch_size(128, n), 1056, 1, 1 }, - { pick_batch_size(128, n), 1024, 1, 1 }, - { pick_batch_size(512, n), 2048, 1, 1 } - }; - // clang-format on -} - -template -inline std::set> get_immed_inputs(T n = MIOPEN_TEST_DEFAULT_BATCH_SIZE_FACTOR) -{ - // clang-format off - return - { - { pick_batch_size(32, n), 1, 14, 14 }, - { pick_batch_size(256, n), 1, 27, 27 }, - { pick_batch_size(128, n), 512, 12, 12 }, - { pick_batch_size(256, n), 256, 13, 13 }, - { pick_batch_size(256, n), 3, 227, 227 }, - { pick_batch_size(32, n), 64, 56, 56 }, - { pick_batch_size(32, n), 96, 14, 14 }, - { pick_batch_size(32, n), 96, 28, 28 }, - { pick_batch_size(64, 
n), 128, 56, 56 }, - { pick_batch_size(64, n), 3, 224, 224 }, - { pick_batch_size(64, n), 256, 14, 14 }, - { 1, 1, 1, 1 } - }; - // clang-format on -} - -template -inline std::set> get_immed_weights(T n = MIOPEN_TEST_DEFAULT_BATCH_SIZE_FACTOR) -{ - // clang-format off - return - { - { pick_batch_size(208, n), 96, 3, 3 }, - { pick_batch_size(24, n), 512, 1, 1 }, - { pick_batch_size(256, n), 128, 3, 3 }, - { pick_batch_size(256, n), 256, 3, 3 }, - { pick_batch_size(256, n), 64, 5, 5 }, - { pick_batch_size(288, n), 144, 3, 3 }, - { pick_batch_size(96, n), 3, 11, 11 }, - { pick_batch_size(32, n), 128, 5, 5 }, - { pick_batch_size(32, n), 128, 1, 1 }, - { pick_batch_size(256, n), 256, 3, 3 }, - { pick_batch_size(512, n), 512, 3, 3 }, - { pick_batch_size(160, n), 128, 3, 3 }, - { pick_batch_size(32, n), 3, 7, 7 } - }; - // clang-format on -} - -template -inline std::set> -get_3d_conv_input_shapes(T n = MIOPEN_TEST_DEFAULT_BATCH_SIZE_FACTOR) -{ - // clang-format off - return - { - { pick_batch_size(128, n), 1, 1, 2, 2}, - { pick_batch_size(128, n), 64, 1, 1, 1}, - { pick_batch_size(128, n), 64, 3, 4, 4}, - { pick_batch_size(352, n), 32, 4, 9, 9}, - { pick_batch_size(192, n), 512, 3, 14, 14}, - { pick_batch_size(352, n), 512, 4, 28, 28}, - { pick_batch_size(256, n), 512, 4, 56, 56}, - { pick_batch_size(192, n), 3, 4, 227, 227}, - { pick_batch_size(128, n), 4, 4, 161, 700} - }; - // clang-format on -} - -template -inline std::set> -get_3d_conv_weight_shapes(T n = MIOPEN_TEST_DEFAULT_BATCH_SIZE_FACTOR) -{ - // clang-format off - return - { - { pick_batch_size( 128, n), 1, 1, 1, 1}, - { pick_batch_size( 352, n), 128, 1, 1, 1}, - { pick_batch_size( 256, n), 128, 1, 1, 1}, - { pick_batch_size( 352, n), 32, 3, 3, 3}, - { pick_batch_size( 352, n), 4, 3, 3, 3}, - { pick_batch_size( 160, n), 4, 3, 5, 5}, - { pick_batch_size( 128, n), 64, 5, 7, 7}, - { pick_batch_size( 192, n), 4, 3, 11, 11}, - { pick_batch_size( 128, n), 1, 3, 1, 7}, - { pick_batch_size( 128, n), 1, 3, 7, 1}, - { 
pick_batch_size( 128, n), 1, 3, 5, 20} - }; - // clang-format on -} - -template -inline std::set> get_bn_peract_inputs(T n = MIOPEN_TEST_DEFAULT_BATCH_SIZE_FACTOR) -{ - // clang-format off - return - { - { pick_batch_size(32, n), 4, 1024,2048}, //Making this much smaller - { pick_batch_size(100, n), 3, 32, 32 }, - { pick_batch_size(100, n), 32, 8, 8 }, - { pick_batch_size(128, n), 256, 12, 12 }, - { pick_batch_size(256, n), 3, 227, 227 }, - { pick_batch_size(64, n), 64, 112, 112 },//Batch-norm ResNet 152 after this line - { pick_batch_size(256, n), 1024, 14, 14 },// n is from the paper @ 256 - { pick_batch_size(256, n), 2048, 7, 7 }, - { pick_batch_size(256, n), 256, 56, 56 }, - { pick_batch_size(256, n), 256, 14, 14 }, - { pick_batch_size(256, n), 512, 28, 28 }, - { pick_batch_size(256, n), 512, 7, 7 }, - { pick_batch_size(256, n), 64, 112, 112 }, - { pick_batch_size(256, n), 64, 56, 56 },//Batch-norm Inception_v3 after this - { pick_batch_size(32, n), 1024, 1, 1 },// n is from the paper @ 32 - { pick_batch_size(32, n), 128, 14, 14 }, - { pick_batch_size(32, n), 128, 28, 28 }, - { pick_batch_size(32, n), 128, 4, 4 }, - { pick_batch_size(32, n), 128, 7, 7 }, - { pick_batch_size(32, n), 160, 7, 7 }, - { pick_batch_size(32, n), 192, 14, 14 }, - { pick_batch_size(32, n), 192, 56, 56 }, - { pick_batch_size(32, n), 192, 7, 7 }, - { pick_batch_size(32, n), 224, 14, 14 }, - { pick_batch_size(32, n), 256, 7, 7 }, - { pick_batch_size(32, n), 256, 14, 14 }, - { pick_batch_size(32, n), 352, 7, 7 }, - { pick_batch_size(32, n), 64, 112, 112 }, - { pick_batch_size(32, n), 64, 14, 14 }, - { pick_batch_size(32, n), 64, 56, 56 }, - { pick_batch_size(32, n), 96, 28, 28 }, - { pick_batch_size(32, n), 32, 256, 512 }, //Killing this config. 
Takes way too long on the CPU - { pick_batch_size(32, n), 256, 28, 28 }, - { pick_batch_size(32, n), 3, 224, 224 }, - { pick_batch_size(32, n), 480, 128, 256 }, - { pick_batch_size(32, n), 528, 64, 128 } - }; - // clang-format on -} - -template -inline std::set> get_bn_spatial_inputs(T n = MIOPEN_TEST_DEFAULT_BATCH_SIZE_FACTOR) -{ - // clang-format off - return - { - { pick_batch_size(32, n), 4, 1024,2048}, //Making this much smaller - { pick_batch_size(32, n), 192, 256, 512 }, - { pick_batch_size(32, n), 480, 128, 256 }, - { pick_batch_size(256, n), 3, 227, 227 }, - { pick_batch_size(256, n), 64, 112, 112 }, - { pick_batch_size(512, n), 16, 32, 32 }, - { pick_batch_size(100, n), 32, 8, 8 }, - { pick_batch_size(128, n), 256, 12, 12 }, - { pick_batch_size(256, n), 128, 28, 28 }, - { pick_batch_size(256, n), 2048, 7, 7 }, - { pick_batch_size(256, n), 256, 56, 56 }, - { pick_batch_size(256, n), 256, 14, 14 }, - { pick_batch_size(256, n), 512, 28, 28 }, - { pick_batch_size(256, n), 512, 7, 7 }, - { pick_batch_size(256, n), 64, 56, 56 },//Batch-norm Inception_v3 after this - { pick_batch_size(32, n), 1024, 1, 1 },// n is from the paper @ 32 - { pick_batch_size(32, n), 128, 14, 14 }, - { pick_batch_size(32, n), 128, 4, 4 }, - { pick_batch_size(32, n), 160, 7, 7 }, - { pick_batch_size(32, n), 192, 14, 14 }, - { pick_batch_size(32, n), 192, 56, 56 }, - { pick_batch_size(32, n), 192, 7, 7 }, - { pick_batch_size(32, n), 224, 14, 14 }, - { pick_batch_size(32, n), 256, 7, 7 }, - { pick_batch_size(32, n), 352, 7, 7 }, - { pick_batch_size(32, n), 64, 14, 14 }, - { pick_batch_size(32, n), 64, 28, 28 }, - { pick_batch_size(32, n), 64, 56, 56 }, - { pick_batch_size(32, n), 96, 28, 28 }, - { pick_batch_size(32, n), 192, 256, 512 }, - { pick_batch_size(32, n), 256, 28, 28 }, - { pick_batch_size(32, n), 3, 224, 224 }, - { pick_batch_size(32, n), 480, 128, 256 }, - { pick_batch_size(32, n), 528, 64, 128 }, - { pick_batch_size(770, n), 1, 8, 8 }, - { pick_batch_size(770, n), 1024, 1, 1 
}, - { pick_batch_size(152, n), 128, 80, 80 }, - { pick_batch_size(152, n), 256, 20, 20 }, - { pick_batch_size(152, n), 32, 160, 160 }, - { pick_batch_size(152, n), 512, 20, 20 }, - { pick_batch_size(152, n), 64, 160, 160 }, - { pick_batch_size(152, n), 64, 80, 80 }, - { pick_batch_size(256, n), 256, 20, 20 }, - { pick_batch_size(256, n), 512, 20, 20 } - }; - // clang-format on -} - -template -inline std::set> get_3d_bn_peract_inputs(T n = MIOPEN_TEST_DEFAULT_BATCH_SIZE_FACTOR) -{ - // clang-format off - return - { - { pick_batch_size(32, n), 1, 32, 32, 32 }, // 32x32x32 based on VoxNet arch - { pick_batch_size(32, n), 1, 14, 14, 14 }, - { pick_batch_size(32, n), 32, 14, 14, 14 }, - { pick_batch_size(32, n), 32, 12, 12, 12 }, - { pick_batch_size(32, n), 32, 6, 6, 6 }, - { pick_batch_size(256, n), 1, 32, 32, 32 }, // 32x32x32 based on VoxNet arch - { pick_batch_size(256, n), 32, 14, 14, 14 }, - { pick_batch_size(256, n), 32, 12, 12, 12 }, - { pick_batch_size(256, n), 32, 6, 6, 6 }, - { pick_batch_size(512, n), 1, 32, 32, 32 }, // 32x32x32 based on VoxNet arch - { pick_batch_size(512, n), 32, 14, 14, 14 }, - { pick_batch_size(512, n), 32, 12, 12, 12 }, - { pick_batch_size(512, n), 32, 6, 6, 6 }, - { pick_batch_size(32, n), 2, 32, 57, 125 }, // Hand-gesture recognition CVPR 2015 paper High Res Net Path - { pick_batch_size(32, n), 32, 14, 25, 59 }, - { pick_batch_size(32, n), 32, 6, 10, 27 }, - { pick_batch_size(32, n), 32, 4, 6, 11 }, - { pick_batch_size(32, n), 32, 2, 2, 3 }, - { pick_batch_size(32, n), 32, 32, 28, 62 }, // Hand-gesture recognition CVPR 2015 paper Low Res Net Path - { pick_batch_size(32, n), 32, 14, 12, 29 }, - { pick_batch_size(32, n), 32, 6, 4, 12 }, - { pick_batch_size(32, n), 32, 4, 2, 2 }, - { pick_batch_size(16, n), 32, 6, 50, 50 }, // Multi-view 3D convnet - { pick_batch_size(1, n), 3, 8, 240, 320 }, // 3D convet on video - { pick_batch_size(1, n), 3, 16, 240, 320 }, // 3D convet on video - { pick_batch_size(1, n), 3, 8, 128, 171 }, // 3D 
convet on video - { pick_batch_size(1, n), 3, 16, 128, 171 }, // 3D convet on video - { pick_batch_size(1, n), 3, 8, 112, 112 }, // 3D convet on video - { pick_batch_size(1, n), 3, 16, 112, 112 } // 3D convet on video - }; - - // clang-format on -} - -template -inline std::set> -get_3d_bn_spatial_inputs(T n = MIOPEN_TEST_DEFAULT_BATCH_SIZE_FACTOR) -{ - // clang-format off - return - { - { pick_batch_size(32, n), 1, 32, 32, 32 }, // 32x32x32 based on VoxNet arch - { pick_batch_size(32, n), 1, 14, 14, 14 }, - { pick_batch_size(32, n), 32, 14, 14, 14 }, - { pick_batch_size(32, n), 32, 12, 12, 12 }, - { pick_batch_size(32, n), 32, 6, 6, 6 }, - { pick_batch_size(256, n), 1, 32, 32, 32 }, // 32x32x32 based on VoxNet arch - { pick_batch_size(256, n), 32, 14, 14, 14 }, - { pick_batch_size(256, n), 32, 12, 12, 12 }, - { pick_batch_size(256, n), 32, 6, 6, 6 }, - { pick_batch_size(512, n), 1, 32, 32, 32 }, // 32x32x32 based on VoxNet arch - { pick_batch_size(512, n), 32, 14, 14, 14 }, - { pick_batch_size(512, n), 32, 12, 12, 12 }, - { pick_batch_size(512, n), 32, 6, 6, 6 }, - { pick_batch_size(32, n), 2, 32, 57, 125 }, // Hand-gesture recognition CVPR 2015 paper High Res Net Path - { pick_batch_size(32, n), 32, 14, 25, 59 }, - { pick_batch_size(32, n), 32, 6, 10, 27 }, - { pick_batch_size(32, n), 32, 4, 6, 11 }, - { pick_batch_size(32, n), 32, 2, 2, 3 }, - { pick_batch_size(32, n), 32, 32, 28, 62 }, // Hand-gesture recognition CVPR 2015 paper Low Res Net Path - { pick_batch_size(32, n), 32, 14, 12, 29 }, - { pick_batch_size(32, n), 32, 6, 4, 12 }, - { pick_batch_size(32, n), 32, 4, 2, 2 }, - { pick_batch_size(16, n), 32, 6, 50, 50 }, // Multi-view 3D convnet - { pick_batch_size(1, n), 3, 8, 240, 320 }, // 3D convet on video - { pick_batch_size(1, n), 3, 16, 240, 320 }, // 3D convet on video - { pick_batch_size(1, n), 3, 8, 128, 171 }, // 3D convet on video - { pick_batch_size(1, n), 3, 16, 128, 171 }, // 3D convet on video - { pick_batch_size(1, n), 3, 8, 112, 112 }, // 3D 
convet on video - { pick_batch_size(1, n), 3, 16, 112, 112 } // 3D convet on video - }; - // clang-format on -} - -template -inline std::vector> get_sub_tensor() -{ - return {{16, 4, 8, 1, 4}, - {2, 4, 8, 8, 4}, - {16, 4, 8, 4}, - {13, 8, 4, 8}, - {3, 8, 7}, - {16, 4, 10}, - {3, 8}, - {16, 4}, - {4}}; -} - -template -inline std::vector> get_tensor_offsets() -{ - static_assert(std::is_signed_v); - return {{0, 0}, {0, 2}, {4, 0}, {5, 7}}; -} - -template -inline std::vector get_tensor_offset() -{ - static_assert(std::is_signed_v); - return {0, 1, 2, 3, 4, 5}; -} - -#endif +// Forwarding header — implementation moved to miopen_utils. +#include diff --git a/projects/miopen/test/random.hpp b/projects/miopen/test/random.hpp index 62443abb1068..3bb99a37d6c9 100644 --- a/projects/miopen/test/random.hpp +++ b/projects/miopen/test/random.hpp @@ -1,62 +1,2 @@ -/******************************************************************************* - * - * MIT License - * - * Copyright (c) 2021 Advanced Micro Devices, Inc. - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - * - *******************************************************************************/ -#ifndef GUARD_MIOPEN_TEST_RANDOM_HPP -#define GUARD_MIOPEN_TEST_RANDOM_HPP - -#include "../driver/random.hpp" - -namespace prng { -template -inline T gen_descreet_uniform_sign(double scale, int32_t range) -{ - return static_cast(scale * prng::gen_A_to_B(-range + 1, range)); -} - -template -inline T gen_descreet_unsigned(double scale, int32_t range) -{ - return static_cast(scale * static_cast(gen_0_to_B(range))); -} - -} // namespace prng - -// lambda factory -template -auto uniform_signed_initializer(ScaleT scale_arg, RangeT range_arg) -{ - return [=](auto&&...) -> T { - // uniform sign give balance of both negative and positive values - return prng::gen_descreet_uniform_sign(scale_arg, range_arg); - }; -} - -template -auto uniform_unsigned_initializer(ScaleT scale_arg, RangeT range_arg) -{ - return [=](auto&&...) -> T { return prng::gen_descreet_unsigned(scale_arg, range_arg); }; -} - -#endif // GUARD_MIOPEN_TEST_RANDOM_HPP +// Forwarding header — implementation moved to miopen_utils. +#include diff --git a/projects/miopen/test/rnn_util.hpp b/projects/miopen/test/rnn_util.hpp index d993d0df4c57..0e771bfdfff1 100644 --- a/projects/miopen/test/rnn_util.hpp +++ b/projects/miopen/test/rnn_util.hpp @@ -1,305 +1,2 @@ -/******************************************************************************* - * - * MIT License - * - * Copyright (c) 2023 Advanced Micro Devices, Inc. 
- * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- * - *******************************************************************************/ - -#ifndef MIOPEN_RNN_UTIL_H_ -#define MIOPEN_RNN_UTIL_H_ - -#include -#include -#include -#include -#include -#include -#include - -#include "gemm.hpp" -#include "random.hpp" - -#include - -// complexity O(NlogN) -inline std::vector GetReverseOrderIndex(const std::vector& base_index) -{ - std::vector reverse_index(base_index.size()); - unsigned next_rev_index = 0; - for(auto id : base_index) - reverse_index[id] = next_rev_index++; - return reverse_index; -}; - -inline std::vector GetSamplesIndexDescendingOrder(const std::vector& unsorted_seq_lens) -{ - const auto sample_count = unsorted_seq_lens.size(); - - std::vector index_v(sample_count); - std::iota(index_v.begin(), index_v.end(), 0); - - auto seq_len_cmp = [&unsorted_seq_lens](unsigned a_id, unsigned b_id) { - return unsorted_seq_lens[a_id] > unsorted_seq_lens[b_id]; - }; - - std::stable_sort(index_v.begin(), index_v.end(), seq_len_cmp); - - return index_v; -} - -template -inline void HiddenTensorReorder(const std::vector& src_array, - std::vector& dst_array, - const std::vector& batch_order, - const std::vector hid_len, - bool is_dst_direct_order) -{ - const size_t copy_size = hid_len[2]; - - const size_t batch_stride = hid_len[2]; - const size_t layer_stride = batch_stride * hid_len[1]; - - for(size_t batch_id = 0; batch_id < hid_len[1]; batch_id++) - { - const auto src_batch_off = - batch_stride * (is_dst_direct_order ? batch_order[batch_id] : batch_id); - const auto dst_batch_off = - batch_stride * (is_dst_direct_order ? 
batch_id : batch_order[batch_id]); - - for(size_t layer_id = 0; layer_id < hid_len[0]; layer_id++) - { - const auto dst_offset = dst_batch_off + layer_id * layer_stride; - const auto src_offset = src_batch_off + layer_id * layer_stride; - - std::copy(src_array.begin() + src_offset, - src_array.begin() + src_offset + copy_size, - dst_array.begin() + dst_offset); - } - } -} - -inline void createTensorDescArray(std::vector& td, - std::vector& ptd, - const std::vector bs, - const int secondDim, - miopenDataType_t dataType) -{ - - std::transform(bs.begin(), bs.end(), std::back_inserter(td), [&](int x) { - return miopen::TensorDescriptor( - dataType, {static_cast(x), static_cast(secondDim)}); - }); - std::transform(td.begin(), td.end(), std::back_inserter(ptd), [](miopen::TensorDescriptor& x) { - return &x; - }); -} - -inline std::tuple -GetTempPackedBuffersSize(std::vector batchs, int in_vec, int out_vec) -{ - size_t total_batch = std::accumulate(batchs.begin(), batchs.end(), 0ULL); - - size_t in_buff_size = total_batch * in_vec; - size_t out_buff_size = total_batch * out_vec; - return {in_buff_size, out_buff_size}; -} - -inline size_t getSuperTensorSize(const std::vector& bs, - int seqLength, - int inputSize, - int hiddenSize, - int maxPaddingVal, - bool isBidirect, - bool isInput, - bool isPadded) -{ - return (isPadded // - ? static_cast(seqLength) * maxPaddingVal - : std::accumulate(bs.begin(), bs.end(), 0ULL)) // - * (isInput // - ? static_cast(inputSize) - : static_cast(hiddenSize) * (isBidirect ? 
2 : 1)); -} - -template -void ChangeDataPadding(const std::vector& src_array, - std::vector& dst_array, - const std::vector& batch_list, - int max_batch, - int sample_size, - bool is_src_packed) -{ - auto seq_len = batch_list.size(); - - auto scr_ptr = &src_array[0]; - auto dst_ptr = &dst_array[0]; - - for(int seq_id = 0; seq_id < seq_len; seq_id++) - { - auto packed_size = batch_list[seq_id] * sample_size; - - std::copy(scr_ptr, scr_ptr + packed_size, dst_ptr); - - if(is_src_packed) - { - dst_ptr += max_batch * sample_size; - scr_ptr += packed_size; - } - else - { - scr_ptr += max_batch * sample_size; - dst_ptr += packed_size; - } - } -} - -// RNN VANILLA configs -inline std::vector get_rnn_num_layers() { return {{1, 3}}; } - -inline std::vector get_rnn_batchSize() { return {{1, 17}}; } - -inline std::vector get_rnn_seq_len() { return {{1, 3, 51}}; } - -inline std::vector get_rnn_vector_len() { return {31}; } - -inline std::vector get_rnn_hidden_size() { return {127}; } - -// LSTM configs -inline std::vector get_lstm_num_layers() { return {{1, 3}}; } - -inline std::vector get_lstm_batchSize() { return {{1, 17}}; } - -inline std::vector get_lstm_seq_len() { return {{1, 25}}; } - -inline std::vector get_lstm_vector_len() { return {17}; } - -inline std::vector get_lstm_hidden_size() { return {67}; } - -// GRU configs -inline std::vector get_gru_num_layers() { return {{1, 3}}; } - -inline std::vector get_gru_batchSize() { return {{1, 17}}; } - -inline std::vector get_gru_seq_len() { return {{1, 23}}; } - -inline std::vector get_gru_vector_len() { return {13}; } - -inline std::vector get_gru_hidden_size() { return {67}; } - -inline std::vector> generate_batchSeq(const int batchSize, const int seqLength) -{ - - static constexpr int modval = 3; - - int currentval = batchSize; - std::vector batchSeq; - batchSeq.reserve(seqLength); - for(int i = 0; i < seqLength; i++) - { - if(i > 0) - { - int nvalue = currentval - prng::gen_0_to_B(modval); - currentval = (nvalue < 1) ? 
1 : nvalue; - // printf("current value: %d\n", currentval); - } - // printf("adding a value to batch sequence: %d\n", currentval); - batchSeq.push_back(currentval); - } - return {batchSeq}; -} - -inline int sumvc(const std::vector& x) { return std::accumulate(x.begin(), x.end(), 0); } - -template -inline T activfunc(T x, int actvf) -{ - T alpha = static_cast(1), beta0 = static_cast(0), beta1 = static_cast(1); - if(actvf == 0) - { - return (x > 0) ? x : x * beta0; - } - else if(actvf == 2) - { - return static_cast(1 / (1 + std::exp(-x))); - } - return static_cast(alpha * std::tanh(beta1 * x)); -} - -template -inline T dervactivfunc(T x, int actvf) -{ - if(actvf == 0) - { - return static_cast(x > 0 ? 1 : 0); - } - else if(actvf == 2) - { - return static_cast(std::exp(-x) / (1 + std::exp(-x)) / (1 + std::exp(-x))); - } - - return static_cast(1 / std::cosh(x) / std::cosh(x)); -} - -template -void RNN_mm_cpu_batched(const Dtype* a_ptr, - size_t a_cols, - size_t a_rows, - size_t lda, - size_t a_stride, - int a_flags, - const Dtype* b_ptr, - size_t b_cols, - size_t b_rows, - size_t ldb, - size_t b_stride, - int b_flags, - Dtype* c_ptr, - size_t c_cols, - size_t c_rows, - size_t ldc, - size_t c_stride, - int batchCount, - double alpha, - double beta) -{ - for(int i = 0; i < batchCount; ++i) - { - gemm_cpu(a_ptr + a_stride * i, - a_cols, - a_rows, - lda, - a_flags == 1 ? true : false, - b_ptr + b_stride * i, - b_cols, - b_rows, - ldb, - b_flags == 1 ? true : false, - c_ptr + c_stride * i, - c_cols, - c_rows, - ldc, - alpha, - beta); - } -} - -#endif +// Forwarding header — implementation moved to miopen_utils. 
+#include diff --git a/projects/miopen/test/serialize.hpp b/projects/miopen/test/serialize.hpp index 6b9b1b29632e..c3eb459c38df 100644 --- a/projects/miopen/test/serialize.hpp +++ b/projects/miopen/test/serialize.hpp @@ -1,129 +1,2 @@ -/******************************************************************************* - * - * MIT License - * - * Copyright (c) 2018 Advanced Micro Devices, Inc. - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- * - *******************************************************************************/ - -#ifndef MIOPEN_GUARD_TEST_SERIALIZE_HPP -#define MIOPEN_GUARD_TEST_SERIALIZE_HPP - -#include -#include -#include -#include -#include -#include -#include - -template -struct is_trivial_serializable : std::is_trivially_copy_constructible -{ -}; - -template <> -struct is_trivial_serializable : std::true_type -{ -}; - -template -std::enable_if_t{}> serialize(std::ostream& os, const T& x) -{ - os.write(reinterpret_cast(&x), sizeof(T)); -} - -template -auto serialize(std::ostream& os, - const T& x) -> decltype(x.begin(), x.end(), T(x.begin(), x.end()), void()) -{ - std::size_t n = std::distance(x.begin(), x.end()); - serialize(os, n); - for(auto&& y : x) - serialize(os, y); -} - -template -std::enable_if_t>{}> -serialize(std::ostream& os, const std::tuple& t) -{ - miopen::unpack( - [&](auto&&... xs) { miopen::each_args([&](auto&& x) { serialize(os, x); }, xs...); }, t); -} - -template -std::enable_if_t{}> serialize(std::istream& is, T& x) -{ - is.read(reinterpret_cast(&x), sizeof(T)); -} - -template -std::enable_if_t{}> serialize(std::istream& is, std::vector& x) -{ - std::size_t n; - serialize(is, n); - x.resize(n); - is.read(reinterpret_cast(x.data()), sizeof(T) * n); -} - -template -auto serialize(std::istream& is, - T& x) -> decltype(x.begin(), x.end(), x.assign(x.begin(), x.end()), void()) -{ - using value_type = std::decay_t; - std::size_t n; - serialize(is, n); - std::vector v; - v.reserve(n); - for(std::size_t i = 0; i < n; i++) - { - value_type y; - serialize(is, y); - v.push_back(y); - } - x.assign(v.begin(), v.end()); -} - -template -std::enable_if_t>{}> -serialize(std::istream& is, - // cppcheck-suppress constParameter - std::tuple& t) -{ - miopen::unpack( - [&](auto&&... 
xs) { miopen::each_args([&](auto&& x) { serialize(is, x); }, xs...); }, t); -} - -template -void load(std::string name, T& x) -{ - std::ifstream is{name.c_str()}; - serialize(is, x); -} - -template -void save(std::string name, const T& x) -{ - std::ofstream os{name.c_str()}; - serialize(os, x); -} - -#endif +// Forwarding header — implementation moved to miopen_utils. +#include diff --git a/projects/miopen/test/tensor_holder.hpp b/projects/miopen/test/tensor_holder.hpp index 64be2aa7c851..bc10b5a8b12d 100644 --- a/projects/miopen/test/tensor_holder.hpp +++ b/projects/miopen/test/tensor_holder.hpp @@ -1,505 +1,2 @@ -/******************************************************************************* - * - * MIT License - * - * Copyright (c) 2017 Advanced Micro Devices, Inc. - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- * - *******************************************************************************/ -#ifndef GUARD_TENSOR_HOLDER_HPP -#define GUARD_TENSOR_HOLDER_HPP - -#include "network_data.hpp" -#include -#include -#include -#include -#include -#include -#include "../driver/random.hpp" - -#include "serialize.hpp" - -#include -using half = half_float::half; -using hip_bfloat16 = bfloat16; -#include "../../src/kernels/hip_float8.hpp" -using float8_fnuz = miopen_f8::hip_f8; -using bfloat8_fnuz = miopen_f8::hip_f8; - -#include -#include - -template -void visit_tensor_size(std::size_t n, F f) -{ - switch(n) - { - case 0: { - f(std::integral_constant{}); - break; - } - case 1: { - f(std::integral_constant{}); - break; - } - case 2: { - f(std::integral_constant{}); - break; - } - case 3: { - f(std::integral_constant{}); - break; - } - case 4: { - f(std::integral_constant{}); - break; - } - case 5: { - f(std::integral_constant{}); - break; - } - default: throw std::runtime_error("Unknown tensor size"); - } -} - -template -struct miopen_type; - -template <> -struct miopen_type : std::integral_constant -{ -}; - -template <> -struct miopen_type : std::integral_constant -{ -}; - -template <> -struct miopen_type : std::integral_constant -{ -}; -template <> -struct miopen_type : std::integral_constant -{ -}; - -template <> -struct miopen_type : std::integral_constant -{ -}; - -template <> -struct miopen_type : std::integral_constant -{ -}; - -template <> -struct miopen_type : std::integral_constant -{ -}; - -template <> -struct miopen_type : std::integral_constant -{ -}; - -template <> -struct miopen_type : std::integral_constant -{ -}; - -template <> -struct miopen_type : std::integral_constant -{ -}; - -template <> -struct miopen_type : std::integral_constant -{ -}; - -template <> -struct miopen_type : std::integral_constant -{ -}; - -template -struct tensor -{ - using value_type = T; - miopen::TensorDescriptor desc; - std::vector data; - -#if defined(__clang__) || defined(__GNUG__) 
-#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wdeprecated-declarations" -#endif - - tensor() : desc(miopen_type{}) {} - -#if defined(__clang__) || defined(__GNUG__) -#pragma GCC diagnostic pop -#endif - - template - tensor(const std::vector& dims) : desc(miopen_type{}, dims), data(desc.GetElementSpace()) - { - } - - template - tensor(const std::vector& dims, const std::vector& strides) - : desc(miopen_type{}, dims, strides), data(desc.GetElementSpace()) - { - assert(dims.size() == strides.size()); - } - - template - tensor(miopenTensorLayout_t layout, const std::vector& dims) - : desc(miopen_type{}, layout, dims), data(desc.GetElementSpace()) - { - } - - template - tensor(miopenTensorLayout_t layout, const std::vector& dims, const std::vector& strides) - : desc(miopen_type{}, layout, dims, strides), data(desc.GetElementSpace()) - { - assert(dims.size() == strides.size()); - } - - tensor(std::size_t n, std::size_t c, std::size_t h, std::size_t w) - : desc(miopen_type{}, {n, c, h, w}), data(n * c * h * w) - { - } - - tensor(miopenTensorLayout_t layout, std::size_t n, std::size_t c, std::size_t h, std::size_t w) - : desc(miopen_type{}, layout, {n, c, h, w}), data(desc.GetElementSpace()) - { - } - - tensor(std::size_t n, std::size_t c, std::size_t d, std::size_t h, std::size_t w) - : desc(miopen_type{}, {n, c, d, h, w}), data(n * c * d * h * w) - { - } - - tensor(std::size_t n) : desc(miopen_type{}, {n}), data(n) {} - - tensor(miopen::TensorDescriptor rhs) : desc(std::move(rhs)) - { - assert(desc.GetType() == miopen_type{} - /// In the driver, T is input tensor type, but output tensor holders - /// are instantiatied with T as well. This leads to false assertion - /// failures when T is INT8 because output type is different. 
- /// \todo Get rid of this hack when the driver is improved: - || (miopen_type{} == miopenInt8 && desc.GetType() == miopenInt32)); - data.resize(desc.GetElementSpace()); - } - - size_t GetDataByteSize() const { return GetSize() * sizeof(T); } - - size_t GetSize() const { return desc.GetElementSpace(); } - - template - tensor& generate(G g) & - { - if(this->desc.GetVectorLength() > 1) - this->generate_vect_impl(g); - else - this->generate_impl(g); - return *this; - } - - template - tensor&& generate(G g) && - { - if(this->desc.GetVectorLength() > 1) - this->generate_vect_impl(g); - else - this->generate_impl(g); - return std::move(*this); - } - - template - void generate_impl(G g) - { - auto seed = std::accumulate(desc.GetLengths().begin(), - desc.GetLengths().end(), - std::size_t{521288629}, - [](auto x, auto y) { - x ^= x << 1U; - return x ^ y; - }); - seed ^= data.size(); - seed ^= desc.GetLengths().size(); - prng::reset_seed(seed); - auto iterator = data.begin(); - auto assign = [&](T x) { - *iterator = x; - ++iterator; - }; - this->for_each( - miopen::compose(miopen::compose(assign, miopen::cast_to()), std::move(g))); - } - - template - void generate_vect_impl(G g) - { - auto seed = std::accumulate(desc.GetLengths().begin(), - desc.GetLengths().end(), - std::size_t{521288629}, - [](auto x, auto y) { - x ^= x << 1U; - return x ^ y; - }); - seed ^= data.size(); - seed ^= desc.GetLengths().size(); - prng::reset_seed(seed); - auto iterator = data.begin(); - auto vectorLength = desc.GetVectorLength(); - auto assign = [&](T x) { - assert(iterator < data.end()); - // for debugging - for(auto i = 0; i < vectorLength; i++) - { - *(iterator + i) = x; - } - iterator += vectorLength; - }; - this->for_each( - miopen::compose(miopen::compose(assign, miopen::cast_to()), std::move(g))); - } - - template - struct for_each_unpacked - { - Loop loop; - F f; - template - auto operator()(Ts... 
xs) const -> decltype(f(xs...), void()) - { - loop(xs...)(std::move(f)); - } - - struct any - { - any() {} - template - any(X) - { - } - }; - - [[noreturn]] void operator()(any = {}, - any = {}, - any = {}, - any = {}, - any = {}, - any = {}, - any = {}, - any = {}, - any = {}) const - { - throw std::runtime_error( - "Arguments to for_each do not match tensor size or the function " + - miopen::get_type_name() + " can not be called."); - } - }; - - struct for_each_handler - { - template - void operator()(Self* self, Loop loop, F f, Size size) const - { - auto dims = miopen::tien(self->desc.GetLengths()); - miopen::unpack(for_each_unpacked{loop, std::move(f)}, dims); - } - }; - - template - void for_each(F f) const - { - visit_tensor_size( - desc.GetLengths().size(), - std::bind(for_each_handler{}, this, miopen::ford, std::move(f), std::placeholders::_1)); - } - - template - void par_for_each(F f) const - { - visit_tensor_size( - desc.GetLengths().size(), - std::bind( - for_each_handler{}, this, miopen::par_ford, std::move(f), std::placeholders::_1)); - } - - template - T& operator()(Ts... xs) - { - assert(this->desc.GetIndex(xs...) < data.size()); - return this->data[this->desc.GetIndex(xs...)]; - } - - template - const T& operator()(Ts... xs) const - { - assert(this->desc.GetIndex(xs...) < data.size()); - return this->data[this->desc.GetIndex(xs...)]; - } - - template - const T& operator()(const std::array& multi_id) const - { - auto f = [&](auto... 
is) { return this->desc.GetIndex(is...); }; - assert(miopen::unpack(f, multi_id) < data.size()); - return this->data[miopen::unpack(f, multi_id)]; - } - - T& operator[](std::size_t i) { return data.at(i); } - - const T& operator[](std::size_t i) const { return data.at(i); } - - typename std::vector::iterator begin() { return data.begin(); } - - typename std::vector::iterator end() { return data.end(); } - - typename std::vector::const_iterator begin() const { return data.begin(); } - - typename std::vector::const_iterator end() const { return data.end(); } - - friend std::ostream& operator<<(std::ostream& stream, const tensor& t) - { - return stream << t.desc; - } - - template - void dump_inner(size_t dim, std::array& coord, Stream& stream) const - { - const auto lengths = this->desc.GetLengths(); - if(lengths.size() == 0) - { - // 0D special case: Just print the one value that we have and return. - stream << (*this)(coord); - } - else if(dim + 1 == lengths.size()) - { - // 1D special case: dump everything on one line - for(size_t i = 0; i < lengths[dim]; ++i) - { - if(i != 0) - stream << ' '; - - coord[dim] = i; - stream << std::setw(4) << (*this)(coord); - } - - stream << '\n'; - } - else - { - if(dim + 2 == lengths.size()) - { - // 2D special case: Also print which 2D slice we are currently printing - // Note: this is not needed for higher dimensions, as they will also pass - // through this branch. 
- stream << "slice ["; - for(size_t i = 0; i < dim; ++i) - { - stream << coord[i] << ", "; - } - stream << ":, :]\n"; - } - - for(size_t i = 0; i < lengths[dim]; ++i) - { - coord[dim] = i; - this->dump_inner(dim + 1, coord, stream); - } - } - } - - template - void dump(const char* name, Stream& stream = std::cout) const - { - const auto n = this->desc.GetLengths().size(); - stream << "==== " << name << ": " << *this << n << '\n'; - stream.fill(' '); - - const auto flags = stream.flags(); - - visit_tensor_size(n, [&](const auto size) { - constexpr size_t N = decltype(size)::value; - std::array coord; - this->dump_inner(0, coord, stream); - }); - - stream.flags(flags); - } -}; - -template -void serialize(std::istream& s, tensor& x) -{ - std::vector lens; - serialize(s, lens); - std::vector strides; - serialize(s, strides); - x.desc = miopen::TensorDescriptor{miopen_type{}, lens, strides}; - serialize(s, x.data); -} - -template -void serialize(std::ostream& s, const tensor& x) -{ - const auto& lens = x.desc.GetLengths(); - const auto& strides = x.desc.GetStrides(); - serialize(s, lens); - serialize(s, strides); - serialize(s, x.data); -} - -struct tensor_generate -{ - template - Tensor&& operator()(Tensor&& t, G g) const - { - return std::forward(t.generate(g)); - } -}; - -struct tensor_elem_gen_integer -{ - uint64_t max_value = 17; - - template - double operator()(Ts... Xs) const - { - static_assert(sizeof...(Ts) < 6, - "Dimensions in tensor_elem_gen_integer must be less than 6."); - assert(max_value > 0); - std::array left = {{Xs...}}; - std::array right = {{613, 547, 701, 877, 1049}}; - uint64_t dot = - std::inner_product(left.begin(), left.end(), right.begin(), static_cast(173)); - return static_cast(dot % max_value); - } -}; - -#endif +// Forwarding header — implementation moved to miopen_utils. 
+#include diff --git a/projects/miopen/test/verify.hpp b/projects/miopen/test/verify.hpp index 1d7d9cf80a50..8807b5ecfe2b 100644 --- a/projects/miopen/test/verify.hpp +++ b/projects/miopen/test/verify.hpp @@ -1,245 +1,2 @@ -/******************************************************************************* - * - * MIT License - * - * Copyright (c) 2017 Advanced Micro Devices, Inc. - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- * - *******************************************************************************/ -#ifndef GUARD_VERIFY_HPP -#define GUARD_VERIFY_HPP - -#include -#include -#include -#include -#include -#include -#include -#include -using half = half_float::half; -using hip_bfloat16 = bfloat16; -#include -#include "tensor_holder.hpp" - -namespace miopen { - -// Compute the value of a range -template -using range_value = typename std::decay().begin())>::type; - -struct sum_fn -{ - template - auto operator()(T x, U y) const MIOPEN_RETURNS(x + y); -}; -static constexpr sum_fn sum{}; - -struct max_fn -{ - template - static T id(T x) - { - return x; - } - - template - auto operator()(T x, U y) const MIOPEN_RETURNS(max_fn::id(x > y ? x : y)); -}; -static constexpr max_fn max{}; - -namespace abs_diff_detail { -using std::fabs; -struct fn -{ - template - auto operator()(T x, U y) const MIOPEN_RETURNS(fabs(x - y)); -}; - -} // namespace abs_diff_detail - -static constexpr abs_diff_detail::fn abs_diff{}; - -struct not_finite_fn -{ - template ), bool>::type = false> - bool operator()(T x) const - { - return !std::isfinite(x); - } - - template ::type, half_float::half>), - bool>::type = false> - bool operator()(T x) const - { - return !half_float::isfinite(x); - } - - template ::type, bfloat16>), - bool>::type = false> - bool operator()(T x) const - { - return !std::isfinite(x); // bfloat16 has float() conversion operator - } - - template ), bool>::type = false> - bool operator()(T x) const - { - std::ignore = x; - return false; - } -}; -static constexpr not_finite_fn not_finite{}; - -template -T as(T, U x) -{ - return x; -} - -struct compare_mag_fn -{ - template - bool operator()(T x, U y) const - { - using std::fabs; - return fabs(x) < fabs(y); - } -}; -static constexpr compare_mag_fn compare_mag{}; - -struct square_diff_fn -{ - template - double operator()(T x, U y) const - { - double diff = static_cast(x - y); - return diff * diff; - } -}; -static constexpr square_diff_fn 
square_diff{}; - -template , bool> = true> -bool equal_values(T const& lhs, T const& rhs) -{ - return lhs == rhs; -} - -template , bool> = true> -bool equal_values(T const& lhs, T const& rhs) -{ - return miopen::float_equal_sentinel(lhs, rhs); -} - -template -bool range_empty(R1&& r1) -{ - return r1.begin() == r1.end(); -} - -template -auto range_distance(R1&& r1) MIOPEN_RETURNS(std::distance(r1.begin(), r1.end())); - -template -bool range_zero(const std::vector& r) -{ - return std::all_of(r.begin(), r.end(), [](T x) { return equal_values(x, T()); }); -} - -template -bool range_zero(const tensor& r) -{ - return range_zero(r.data); -} - -template -T range_product(R1&& r1, R2&& r2, T state, Reducer r, Product p) -{ - return std::inner_product(r1.begin(), r1.end(), r2.begin(), state, r, p); -} - -template -std::size_t mismatch_idx(R1&& r1, R2&& r2, Compare compare) -{ - auto p = std::mismatch(r1.begin(), r1.end(), r2.begin(), compare); - return std::distance(r1.begin(), p.first); -} - -template -int64_t find_idx(R1&& r1, Predicate p) -{ - auto it = std::find_if(r1.begin(), r1.end(), p); - if(it == r1.end()) - return -1; - else - return std::distance(r1.begin(), it); -} - -template -double max_diff(R1&& r1, R2&& r2) -{ - return range_product(r1, r2, 0.0, max, abs_diff); -} - -template -auto max_diff_v2(R1&& r1, R2&& r2) -{ - using T = decltype(r1[0] - r2[0]); - auto abs_diff_func = [](auto x, auto y) { return x > y ? 
x - y : y - x; }; - // BUG: deduced wrong datatype, half_float bug - if constexpr(std::is_same_v) - return range_product(r1, r2, half_float::half(), max, abs_diff_func); - else - return range_product(r1, r2, T(), max, abs_diff_func); -} - -template -std::size_t mismatch_diff(R1&& r1, R2&& r2, T diff) -{ - return mismatch_idx( - r1, - r2, - std::bind( - float_equal, diff, std::bind(abs_diff, std::placeholders::_1, std::placeholders::_2))); -} - -template -double rms_range(R1&& r1, R2&& r2) -{ - std::size_t n = range_distance(r1); - if(n == range_distance(r2)) - { - if(n == 0) - return 0; - double square_difference = range_product(r1, r2, 0.0, sum_fn{}, square_diff); - double mag1 = static_cast(*std::max_element(r1.begin(), r1.end(), compare_mag)); - double mag2 = static_cast(*std::max_element(r2.begin(), r2.end(), compare_mag)); - double mag = - std::max({std::fabs(mag1), std::fabs(mag2), std::numeric_limits::min()}); - return std::sqrt(square_difference) / (std::sqrt(n) * mag); - } - else - return double(std::numeric_limits>::max()); -} -} // namespace miopen -#endif +// Forwarding header — implementation moved to miopen_utils. +#include