halide
diff --git a/src/ApproximationTables.cpp b/src/ApproximationTables.cpp
Lines changed: 1 addition & 1 deletion
diff --git a/src/FastMathFunctions.cpp b/src/FastMathFunctions.cpp
Lines changed: 79 additions & 0 deletions
diff --git a/src/FastMathFunctions.h b/src/FastMathFunctions.h
Lines changed: 3 additions & 0 deletions
diff --git a/src/IROperator.cpp b/src/IROperator.cpp
Lines changed: 1 addition & 0 deletions
diff --git a/src/IROperator.h b/src/IROperator.h
Lines changed: 5 additions & 2 deletions
@@ -168,7 +168,7 @@ const Approximation *find_best_approximation(const std::vector<Approximation> &t
     std::printf("Looking for min_terms=%d, max_absolute_error=%f\n",
                 precision.constraint_min_poly_terms, precision.constraint_max_absolute_error);
 #endif
-    constexpr double safety_factor = 1.05;
+    constexpr double safety_factor = 1.02;
     for (size_t i = 0; i < table.size(); ++i) {
         const Approximation &e = table[i];
 
 
@@ -316,13 +316,18 @@ Expr fast_tanh(const Expr &x, ApproximationPrecision prec) {
         // Positive arguments to exp() have preciser ULP.
         // So, we will rewrite the expression to always use exp(2*x)
         // instead of exp(-2*x) when we are close to zero.
+        // Rewriting it like this is slightly more expensive, hence the branch
+        // to only pay this extra cost in case we need MULPE-optimized approximations.
         Expr flip_exp = abs_x > constant(type, 4);
         Expr arg_exp = select(flip_exp, -abs_x, abs_x);
         Expr exp2x = Halide::fast_exp(2 * arg_exp, prec);
         Expr tanh = (exp2x - constant(type, 1.0)) / (exp2x + constant(type, 1));
         tanh = select(flip_exp ^ flip_sign, -tanh, tanh);
         return common_subexpression_elimination(tanh, true);
     } else {
+        // Even if we are optimizing for MAE, the nested call to exp()
+        // should be MULPE optimized for accuracy, as we are taking ratios.
+        prec.optimized_for = ApproximationPrecision::MULPE;
         Expr exp2x = Halide::fast_exp(-2 * abs_x, prec);
         Expr tanh = (constant(type, 1) - exp2x) / (constant(type, 1) + exp2x);
         tanh = select(flip_sign, -tanh, tanh);
@@ -435,6 +440,57 @@ IntrinsicsInfoPerDeviceAPI ii_tanh{
 }};
 // clang-format on
 
+/** Report whether the fast math intrinsic `op` has an intrinsic-based
+ * implementation registered for `device`.
+ *
+ * The intrinsic is first mapped onto its per-device-API table (fast_atan and
+ * fast_atan2 share one table). The result is true when that table has an
+ * entry for `device` whose intrinsic is defined. The one special case is
+ * fast_tanh on CUDA, which additionally requires the CUDA capability lower
+ * bound of `t` to be at least 75.
+ *
+ * Triggers an internal assertion when `op` is not one of the fast_xxx math
+ * intrinsics handled by the switch below. */
+bool fast_math_func_has_intrinsic_based_implementation(Call::IntrinsicOp op, DeviceAPI device, const Target &t) {
+    const IntrinsicsInfoPerDeviceAPI *iipda = nullptr;
+    switch (op) {
+    case Call::fast_atan:
+    case Call::fast_atan2:
+        iipda = &ii_atan_atan2;
+        break;
+    case Call::fast_cos:
+        iipda = &ii_cos;
+        break;
+    case Call::fast_exp:
+        iipda = &ii_exp;
+        break;
+    case Call::fast_log:
+        iipda = &ii_log;
+        break;
+    case Call::fast_pow:
+        iipda = &ii_pow;
+        break;
+    case Call::fast_sin:
+        iipda = &ii_sin;
+        break;
+    case Call::fast_tan:
+        iipda = &ii_tan;
+        break;
+    case Call::fast_tanh:
+        iipda = &ii_tanh;
+        break;
+
+    default: {
+        // A fast_xxx intrinsic that reaches this point is missing from the
+        // switch above. Only the prefix is tested, so that any other
+        // intrinsic (however short its name) reaches the generic assertion
+        // below instead of being reported as an unhandled switch case.
+        const std::string name = Call::get_intrinsic_name(op);
+        internal_assert(name.compare(0, 5, "fast_") != 0) << "Did not handle " << name << " in switch case";
+        break;
+    }
+    }
+
+    internal_assert(iipda != nullptr) << "Function is only supported for fast_xxx math functions. Got: " << Call::get_intrinsic_name(op);
+
+    for (const auto &cand : iipda->device_apis) {
+        // Skip entries for other device APIs, and entries without an intrinsic.
+        if (cand.device_api != device || !cand.intrinsic.defined()) {
+            continue;
+        }
+        if (op == Call::fast_tanh && device == DeviceAPI::CUDA) {
+            return t.get_cuda_capability_lower_bound() >= 75;
+        }
+        return true;
+    }
+    return false;
+}
+
 IntrinsicsInfo resolve_precision(ApproximationPrecision &prec, const IntrinsicsInfoPerDeviceAPI &iida, DeviceAPI api) {
     IntrinsicsInfo ii{};
     for (const auto &cand : iida.device_apis) {
@@ -562,6 +618,18 @@ class LowerFastMathFunctions : public IRMutator {
         return for_device_api == DeviceAPI::CUDA && target.get_cuda_capability_lower_bound() >= 75;
     }
 
+    /** Tightens the requested precision constraints in place when lowering
+     * for DeviceAPI::None on an x86 target that is 32-bit or lacks the FMA
+     * feature. Leaves `prec` untouched in every other case. */
+    void adjust_precision_for_target(ApproximationPrecision &prec) {
+        const bool x86_without_device_api =
+            for_device_api == DeviceAPI::None && target.arch == Target::Arch::X86;
+        if (!x86_without_device_api) {
+            return;
+        }
+
+        const bool has_fma = target.has_feature(Target::Feature::FMA);
+        if (target.bits == 32 || !has_fma) {
+            // If we do not have fused-multiply-add, we lose some precision,
+            // so ask for an approximation with half the error budget.
+            prec.constraint_max_absolute_error *= 0.5f;
+            prec.constraint_max_ulp_error /= 2;
+        }
+    }
+
     /** Strips the fast_ prefix, appends the type suffix, and
      * drops the precision argument from the end. */
     Expr to_native_func(const Call *op) {
@@ -652,6 +720,7 @@ class LowerFastMathFunctions : public IRMutator {
             }
 
             // No known fast version available, we will expand our own approximation.
+            adjust_precision_for_target(prec);
             return ApproxImpl::fast_sin(mutate(op->args[0]), prec);
         } else if (op->is_intrinsic(Call::fast_cos)) {
             ApproximationPrecision prec = extract_approximation_precision(op);
@@ -664,6 +733,7 @@ class LowerFastMathFunctions : public IRMutator {
             }
 
             // No known fast version available, we will expand our own approximation.
+            adjust_precision_for_target(prec);
             return ApproxImpl::fast_cos(mutate(op->args[0]), prec);
         } else if (op->is_intrinsic(Call::fast_atan) || op->is_intrinsic(Call::fast_atan2)) {
             // Handle fast_atan and fast_atan2 together!
@@ -673,6 +743,8 @@ class LowerFastMathFunctions : public IRMutator {
                 // The native atan is fast: fall back to native and continue lowering.
                 return to_native_func(op);
             }
+
+            adjust_precision_for_target(prec);
             if (op->is_intrinsic(Call::fast_atan)) {
                 return ApproxImpl::fast_atan(mutate(op->args[0]), prec);
             } else {
@@ -696,6 +768,8 @@ class LowerFastMathFunctions : public IRMutator {
                 // The native atan is fast: fall back to native and continue lowering.
                 return to_native_func(op);
             }
+
+            adjust_precision_for_target(prec);
             return ApproxImpl::fast_tan(mutate(op->args[0]), prec);
         } else if (op->is_intrinsic(Call::fast_exp)) {
             // Handle fast_exp and fast_log together!
@@ -718,6 +792,8 @@ class LowerFastMathFunctions : public IRMutator {
                 // The native atan is fast: fall back to native and continue lowering.
                 return to_native_func(op);
             }
+
+            adjust_precision_for_target(prec);
             return ApproxImpl::fast_exp(mutate(op->args[0]), prec);
         } else if (op->is_intrinsic(Call::fast_log)) {
             // Handle fast_exp and fast_log together!
@@ -738,6 +814,8 @@ class LowerFastMathFunctions : public IRMutator {
                 // The native atan is fast: fall back to native and continue lowering.
                 return to_native_func(op);
             }
+
+            adjust_precision_for_target(prec);
             return ApproxImpl::fast_log(mutate(op->args[0]), prec);
         } else if (op->is_intrinsic(Call::fast_tanh)) {
             ApproximationPrecision prec = extract_approximation_precision(op);
@@ -748,6 +826,7 @@ class LowerFastMathFunctions : public IRMutator {
             }
 
             // Expand using defintion in terms of exp(2x), and recurse.
+            // Note: no adjustment of precision, as the recursed mutation will take care of that!
             return mutate(ApproxImpl::fast_tanh(op->args[0], prec));
         } else if (op->is_intrinsic(Call::fast_pow)) {
             ApproximationPrecision prec = extract_approximation_precision(op);
 
@@ -2,10 +2,13 @@
 #define HALIDE_INTERNAL_FAST_MATH_H
 
 #include "Expr.h"
+#include "IR.h"
 
 namespace Halide {
 namespace Internal {
 
+/** Returns whether the fast_xxx math intrinsic `op` has an intrinsic-based
+ * implementation for `device` on target `t`. Triggers an internal assertion
+ * when `op` is not one of the fast_xxx math intrinsics. */
+bool fast_math_func_has_intrinsic_based_implementation(Call::IntrinsicOp op, DeviceAPI device, const Target &t);
+
 Stmt lower_fast_math_functions(const Stmt &s, const Target &t);
 
 }
 
@@ -1383,6 +1383,7 @@ Expr fast_pow(const Expr &x, const Expr &y, ApproximationPrecision prec) {
     if (auto i = as_const_int(y)) {
         return raise_to_integer_power(x, *i);
     }
+    user_assert(x.type() == Float(32) && y.type() == Float(32)) << "fast_exp only works for Float(32)";
     return Call::make(x.type(), Call::fast_pow, {x, y, make_approximation_precision_info(prec)}, Call::PureIntrinsic);
 }
 
 
@@ -1073,9 +1073,11 @@ struct ApproximationPrecision {
  * See \ref ApproximationPrecision for details on specifying precision.
  */
 // @{
-//* On NVIDIA CUDA: default-precision maps to a dedicated sin.approx.f32 instruction. */
+/** Caution: Might exceed the range (-1, 1) by a tiny bit.
+ * On NVIDIA CUDA: default-precision maps to a dedicated sin.approx.f32 instruction. */
 Expr fast_sin(const Expr &x, ApproximationPrecision precision = {});
-/** On NVIDIA CUDA: default-precision maps to a dedicated cos.approx.f32 instruction. */
+/** Caution: Might exceed the range (-1, 1) by a tiny bit.
+ * On NVIDIA CUDA: default-precision maps to a dedicated cos.approx.f32 instruction. */
 Expr fast_cos(const Expr &x, ApproximationPrecision precision = {});
 /** On NVIDIA CUDA: default-precision maps to a combination of sin.approx.f32,
  * cos.approx.f32, div.approx.f32 instructions. */
@@ -1118,6 +1120,7 @@ Expr fast_pow(const Expr &x, const Expr &y, ApproximationPrecision precision = {
 
 /** Fast approximate pow for Float(32).
  * Approximations accurate to 2e-7 MAE, and Max 2500 ULPs (on average < 1 ULP) available.
+ * Caution: might exceed the range (-1, 1) by a tiny bit.
  * Vectorizes cleanly when using polynomials.
  * Slow on x86 if you don't have at least sse 4.1.
  * On NVIDIA CUDA: default-precision maps to a combination of ex2.approx.f32 and lg2.approx.f32.
Original file line number	Diff line number	Diff line change
`@@ -1383,6 +1383,7 @@ Expr fast_pow(const Expr &x, const Expr &y, ApproximationPrecision prec) {`
`1383`	`1383`	`if (auto i = as_const_int(y)) {`
`1384`	`1384`	`return raise_to_integer_power(x, *i);`
`1385`	`1385`	`}`
	`1386`	`+ user_assert(x.type() == Float(32) && y.type() == Float(32)) << "fast_exp only works for Float(32)";`
`1386`	`1387`	`return Call::make(x.type(), Call::fast_pow, {x, y, make_approximation_precision_info(prec)}, Call::PureIntrinsic);`
`1387`	`1388`	`}`
`1388`	`1389`