stan-dev
diff --git a/‎stan/math/opencl/kernel_generator/elt_function_cl.hpp‎
Lines changed: 53 additions & 3 deletions b/‎stan/math/opencl/kernel_generator/elt_function_cl.hpp‎
Lines changed: 53 additions & 3 deletions
diff --git a/‎stan/math/opencl/kernels/device_functions/binomial_coefficient_log.hpp‎
Lines changed: 1 addition & 1 deletion b/‎stan/math/opencl/kernels/device_functions/binomial_coefficient_log.hpp‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎stan/math/opencl/kernels/device_functions/lbeta.hpp‎
Lines changed: 2 additions & 2 deletions b/‎stan/math/opencl/kernels/device_functions/lbeta.hpp‎
Lines changed: 2 additions & 2 deletions
diff --git a/‎stan/math/opencl/kernels/device_functions/std_normal_lcdf.hpp‎
Lines changed: 168 additions & 0 deletions b/‎stan/math/opencl/kernels/device_functions/std_normal_lcdf.hpp‎
Lines changed: 168 additions & 0 deletions
diff --git a/‎stan/math/opencl/prim/exp_mod_normal_lcdf.hpp‎
Lines changed: 57 additions & 10 deletions b/‎stan/math/opencl/prim/exp_mod_normal_lcdf.hpp‎
Lines changed: 57 additions & 10 deletions
diff --git a/‎stan/math/opencl/prim/gumbel_cdf.hpp‎
Lines changed: 1 addition & 2 deletions b/‎stan/math/opencl/prim/gumbel_cdf.hpp‎
Lines changed: 1 addition & 2 deletions
@@ -24,6 +24,7 @@
 #include <stan/math/opencl/kernels/device_functions/multiply_log.hpp>
 #include <stan/math/opencl/kernels/device_functions/Phi.hpp>
 #include <stan/math/opencl/kernels/device_functions/Phi_approx.hpp>
+#include <stan/math/opencl/kernels/device_functions/std_normal_lcdf.hpp>
 #include <stan/math/opencl/kernels/device_functions/trigamma.hpp>
 #include <stan/math/opencl/matrix_cl_view.hpp>
 #include <stan/math/opencl/kernel_generator/common_return_scalar.hpp>
@@ -314,6 +315,12 @@ ADD_UNARY_FUNCTION_WITH_INCLUDES(Phi, opencl_kernels::phi_device_function)
 ADD_UNARY_FUNCTION_WITH_INCLUDES(Phi_approx,
                                  opencl_kernels::inv_logit_device_function,
                                  opencl_kernels::phi_approx_device_function)
+ADD_UNARY_FUNCTION_WITH_INCLUDES(
+    std_normal_lcdf_scaled_impl,  // log(Phi(x)) given x / sqrt(2)
+    opencl_kernels::std_normal_lcdf_device_function)
+ADD_UNARY_FUNCTION_WITH_INCLUDES(  // uses the same source string as above
+    std_normal_lcdf_dscaled_impl,  // derivative w.r.t. the scaled input
+    opencl_kernels::std_normal_lcdf_device_function)
 ADD_UNARY_FUNCTION_WITH_INCLUDES(inv_Phi, opencl_kernels::log1m_device_function,
                                  opencl_kernels::phi_device_function,
                                  opencl_kernels::inv_phi_device_function)
@@ -352,10 +359,53 @@ ADD_BINARY_FUNCTION_WITH_INCLUDES(
     stan::math::opencl_kernels::lgamma_stirling_diff_device_function,
     stan::math::opencl_kernels::lbeta_device_function,
     stan::math::opencl_kernels::binomial_coefficient_log_device_function)
-ADD_BINARY_FUNCTION_WITH_INCLUDES(
-    lbeta, stan::math::opencl_kernels::lgamma_stirling_device_function,
+/**
+ * Kernel generator expression representing the element-wise `lbeta` of two
+ * argument expressions. The name handed to the base class is `stan_lbeta`.
+ *
+ * @tparam T1 type of the first argument expression
+ * @tparam T2 type of the second argument expression
+ */
+template <typename T1, typename T2>
+class lbeta_ : public elt_function_cl<lbeta_<T1, T2>, double, T1, T2> {
+  using base = elt_function_cl<lbeta_<T1, T2>, double, T1, T2>;
+  using base::arguments_;
+
+ public:
+  using base::rows;
+  using base::cols;
+  /** Device function sources this expression relies on. */
+  static const std::vector<const char*> includes;
+
+  /**
+   * Builds the expression from its two arguments and verifies that their
+   * sizes agree in every dimension where neither side is `base::dynamic`.
+   *
+   * @param a first argument
+   * @param b second argument
+   */
+  explicit lbeta_(T1&& a, T2&& b)
+      : base("stan_lbeta", std::forward<T1>(a), std::forward<T2>(b)) {
+    // `a` and `b` have been forwarded (and possibly moved from) above, so the
+    // sizes are read from the arguments now stored in the base class.
+    const auto& first = this->template get_arg<0>();
+    const auto& second = this->template get_arg<1>();
+    if (first.rows() != base::dynamic && second.rows() != base::dynamic) {
+      check_size_match("lbeta", "Rows of ", "a", first.rows(), "rows of ", "b",
+                       second.rows());
+    }
+    if (first.cols() != base::dynamic && second.cols() != base::dynamic) {
+      check_size_match("lbeta", "Columns of ", "a", first.cols(),
+                       "columns of ", "b", second.cols());
+    }
+  }
+
+  /**
+   * Creates a copy of this expression in which both arguments are replaced
+   * by the results of their own `deep_copy()`.
+   *
+   * @return new `lbeta_` expression built from the copied arguments
+   */
+  inline auto deep_copy() const {
+    auto&& arg1_copy = this->template get_arg<0>().deep_copy();
+    auto&& arg2_copy = this->template get_arg<1>().deep_copy();
+    return lbeta_<std::remove_reference_t<decltype(arg1_copy)>,
+                  std::remove_reference_t<decltype(arg2_copy)>>{
+        std::move(arg1_copy), std::move(arg2_copy)};
+  }
+
+  /**
+   * Reports the extreme diagonal indices of the result.
+   *
+   * @return pair `{-rows() + 1, cols() - 1}`
+   */
+  inline std::pair<int, int> extreme_diagonals() const {
+    return {-rows() + 1, cols() - 1};
+  }
+};
+
+/**
+ * Builds the kernel generator expression for `lbeta` of two arguments.
+ * Each argument is passed through `as_operation_cl` before being handed to
+ * the `lbeta_` expression.
+ *
+ * @tparam T1 type of the first argument
+ * @tparam T2 type of the second argument
+ * @param a first argument
+ * @param b second argument
+ * @return `lbeta_` expression wrapping both arguments
+ */
+template <typename T1, typename T2,
+          require_all_kernel_expressions_t<T1, T2>* = nullptr,
+          require_any_not_stan_scalar_t<T1, T2>* = nullptr>
+inline lbeta_<as_operation_cl_t<T1>, as_operation_cl_t<T2>> lbeta(T1&& a,
+                                                                  T2&& b) {
+  using lbeta_expr_t = lbeta_<as_operation_cl_t<T1>, as_operation_cl_t<T2>>;
+  return lbeta_expr_t(as_operation_cl(std::forward<T1>(a)),
+                      as_operation_cl(std::forward<T2>(b)));
+}
+
+template <typename T1, typename T2>  // sources the lbeta_ expression uses
+const std::vector<const char*> lbeta_<T1, T2>::includes{
+    stan::math::opencl_kernels::lgamma_stirling_device_function,
     stan::math::opencl_kernels::lgamma_stirling_diff_device_function,
-    stan::math::opencl_kernels::lbeta_device_function)
+    stan::math::opencl_kernels::lbeta_device_function};  // stan_lbeta source
 ADD_BINARY_FUNCTION_WITH_INCLUDES(
     log_inv_logit_diff, opencl_kernels::log1p_exp_device_function,
     opencl_kernels::log1m_exp_device_function,
 
@@ -95,7 +95,7 @@ static constexpr const char* binomial_coefficient_log_device_function
             } else if (n_plus_1 < LGAMMA_STIRLING_DIFF_USEFUL) {
               return lgamma(n_plus_1) - lgamma(k + 1) - lgamma(n_plus_1_mk);
             } else {
-              return -lbeta(n_plus_1_mk, k + 1) - log1p(n);
+              return -stan_lbeta(n_plus_1_mk, k + 1) - log1p(n);
             }
           }
           // \cond
 
@@ -59,9 +59,9 @@ static constexpr const char* lbeta_device_function
            * @param b Second value
            * @return Log of the beta function applied to the two values.
            */
-          double lbeta(double a, double b) {
+          double stan_lbeta(double a, double b) {
             if (isnan(a) || isnan(b)) {
-              return a;
+              return NAN;
             }
 
             double x;  // x is the smaller of the two
 
@@ -0,0 +1,168 @@
+#ifndef STAN_MATH_OPENCL_KERNELS_DEVICE_FUNCTIONS_STD_NORMAL_LCDF_HPP
+#define STAN_MATH_OPENCL_KERNELS_DEVICE_FUNCTIONS_STD_NORMAL_LCDF_HPP
+#ifdef STAN_OPENCL
+
+#include <stan/math/opencl/stringify.hpp>
+#include <string>
+
+namespace stan {
+namespace math {
+namespace opencl_kernels {
+// \cond
+static constexpr const char* std_normal_lcdf_device_function
+    = "\n"
+      "#ifndef STAN_MATH_OPENCL_KERNELS_DEVICE_FUNCTIONS_STD_NORMAL_LCDF\n"
+      "#define STAN_MATH_OPENCL_KERNELS_DEVICE_FUNCTIONS_STD_NORMAL_LCDF\n"
+      STRINGIFY(
+          /** \ingroup opencl_kernels
+           * Return the log standard normal cumulative distribution function
+           * evaluated from the scaled input `x / sqrt(2)`.
+           *
+           * The result is computed piecewise in `scaled_y`:
+           * - `scaled_y > 0`: `log1p(-0.5 * erfc(scaled_y))`, with a NaN
+           *   result replaced by 0;
+           * - `-20 < scaled_y <= 0`: `log(erfc(-scaled_y)) - M_LN2`;
+           * - `scaled_y <= -20` while `10 * log(|scaled_y|) < log(DBL_MAX)`:
+           *   an expression built from a ratio of two polynomials in
+           *   `1 / scaled_y^2`;
+           * - anything else: `-INFINITY`. A NaN input fails every comparison
+           *   and therefore also ends up here.
+           *
+           * @param scaled_y input scaled by `1 / sqrt(2)`
+           * @return log(Phi(x)), where `x = sqrt(2) * scaled_y`
+           */
+          inline double std_normal_lcdf_scaled_impl(double scaled_y) {
+            double lcdf_n;
+            if (scaled_y > 0.0) {
+              // With s = scaled_y: CDF = 1/2 + 1/2 erf(s) = 1 - 1/2 erfc(s)
+              lcdf_n = log1p(-0.5 * erfc(scaled_y));
+              if (isnan(lcdf_n)) {
+                lcdf_n = 0;  // i.e. log(1)
+              }
+            } else if (scaled_y > -20.0) {
+              // With s = scaled_y: CDF = 1/2 - 1/2 erf(-s) = 1/2 erfc(-s)
+              lcdf_n = log(erfc(-scaled_y)) - M_LN2;
+            } else if (10.0 * log(fabs(scaled_y)) < log(DBL_MAX)) {
+              /* Need direct approximation once erfc(-x) underflows.
+               * The guard above is equivalent to |scaled_y|^10 < DBL_MAX,
+               * which bounds the largest power computed below. */
+              const double x2 = scaled_y * scaled_y;
+              const double x4 = pow(scaled_y, 4);
+              const double x6 = pow(scaled_y, 6);
+              const double x8 = pow(scaled_y, 8);
+              const double x10 = pow(scaled_y, 10);
+              const double temp_p
+                  = 0.000658749161529837803157 + 0.0160837851487422766278 / x2
+                    + 0.125781726111229246204 / x4
+                    + 0.360344899949804439429 / x6
+                    + 0.305326634961232344035 / x8
+                    + 0.0163153871373020978498 / x10;
+              const double temp_q
+                  = -0.00233520497626869185443
+                    - 0.0605183413124413191178 / x2
+                    - 0.527905102951428412248 / x4
+                    - 1.87295284992346047209 / x6
+                    - 2.56852019228982242072 / x8 - 1.0 / x10;
+              lcdf_n = log(0.5 * M_2_SQRTPI + (temp_p / temp_q) / x2)
+                       - M_LN2 - log(-scaled_y) - x2;
+            } else {
+              lcdf_n = -INFINITY;  // also reached for a NaN input
+            }
+            return lcdf_n;
+          }
+
+          /** \ingroup opencl_kernels
+           * Return the derivative of log standard normal cumulative
+           * distribution function with respect to the scaled input
+           * `x / sqrt(2)`.
+           *
+           * The value is assembled piecewise in `scaled_y`:
+           * - above 2.9: a closed-form ratio involving `exp(scaled_y^2)`;
+           * - (2.5, 2.9], (2.1, 2.5], (1.5, 2.1], (0.8, 1.5] and (0.1, 0.8]:
+           *   degree-6 polynomials in the offset of `scaled_y` from 2.7,
+           *   2.3, 1.85, 1.15 and 0.45 respectively;
+           * - at most 0.1, while `10 * log(|scaled_y|) < log(DBL_MAX)`: a
+           *   rational expression in `t = 1 / (1 - 0.3275911 * scaled_y)`,
+           *   plus a polynomial correction in `scaled_y` once it drops
+           *   below -2.1;
+           * - anything else: `INFINITY`. A NaN input fails every comparison
+           *   and therefore also ends up here.
+           *
+           * NOTE(review): the `scaled_y > 2.9` branch applies the five
+           * coefficients to t^0..t^4, while the `scaled_y <= 0.1` branch
+           * applies the same magnitudes to t^1..t^5 - confirm that this
+           * difference is intended.
+           *
+           * @param scaled_y input scaled by `1 / sqrt(2)`
+           * @return d / d(scaled_y) log(Phi(x))
+           */
+          inline double std_normal_lcdf_dscaled_impl(double scaled_y) {
+            double dnlcdf = 0.0;
+            double t = 0.0;
+            double t2 = 0.0;
+            double t4 = 0.0;
+            const double x2 = scaled_y * scaled_y;
+
+            if (scaled_y > 2.9) {
+              t = 1.0 / (1.0 + 0.3275911 * scaled_y);
+              t2 = t * t;
+              t4 = pow(t, 4);
+              dnlcdf
+                  = 0.5 * M_2_SQRTPI
+                    / (exp(x2) - 0.254829592 + 0.284496736 * t
+                       - 1.421413741 * t2 + 1.453152027 * t2 * t
+                       - 1.061405429 * t4);
+            } else if (scaled_y > 2.5) {
+              t = scaled_y - 2.7;
+              t2 = t * t;
+              t4 = pow(t, 4);
+              dnlcdf = 0.0003849882382 - 0.002079084702 * t
+                       + 0.005229340880 * t2 - 0.008029540137 * t2 * t
+                       + 0.008232190507 * t4 - 0.005692364250 * t4 * t
+                       + 0.002399496363 * pow(t, 6);
+            } else if (scaled_y > 2.1) {
+              t = scaled_y - 2.3;
+              t2 = t * t;
+              t4 = pow(t, 4);
+              dnlcdf = 0.002846135439 - 0.01310032351 * t
+                       + 0.02732189391 * t2 - 0.03326906904 * t2 * t
+                       + 0.02482478940 * t4 - 0.009883071924 * t4 * t
+                       - 0.0002771362254 * pow(t, 6);
+            } else if (scaled_y > 1.5) {
+              t = scaled_y - 1.85;
+              t2 = t * t;
+              t4 = pow(t, 4);
+              dnlcdf = 0.01849212058 - 0.06876280470 * t
+                       + 0.1099906382 * t2 - 0.09274533184 * t2 * t
+                       + 0.03543327418 * t4 + 0.005644855518 * t4 * t
+                       - 0.01111434424 * pow(t, 6);
+            } else if (scaled_y > 0.8) {
+              t = scaled_y - 1.15;
+              t2 = t * t;
+              t4 = pow(t, 4);
+              dnlcdf = 0.1585747034 - 0.3898677543 * t
+                       + 0.3515963775 * t2 - 0.09748053605 * t2 * t
+                       - 0.04347986191 * t4 + 0.02182506378 * t4 * t
+                       + 0.01074751427 * pow(t, 6);
+            } else if (scaled_y > 0.1) {
+              t = scaled_y - 0.45;
+              t2 = t * t;
+              t4 = pow(t, 4);
+              dnlcdf = 0.6245634904 - 0.9521866949 * t
+                       + 0.3986215682 * t2 + 0.04700850676 * t2 * t
+                       - 0.03478651979 * t4 - 0.01772675404 * t4 * t
+                       + 0.0006577254811 * pow(t, 6);
+            } else if (10.0 * log(fabs(scaled_y)) < log(DBL_MAX)) {
+              t = 1.0 / (1.0 - 0.3275911 * scaled_y);
+              t2 = t * t;
+              t4 = pow(t, 4);
+              dnlcdf
+                  = M_2_SQRTPI
+                    / (0.254829592 * t - 0.284496736 * t2
+                       + 1.421413741 * t2 * t - 1.453152027 * t4
+                       + 1.061405429 * t4 * t);
+              if (scaled_y < -29.0) {  // quadratic correction
+                dnlcdf += 0.0015065154280332 * x2 - 0.3993154819705530 * scaled_y
+                          - 4.2919418242931700;
+              } else if (scaled_y < -17.0) {  // cubic correction
+                dnlcdf += 0.0001263257217272 * x2 * scaled_y
+                          + 0.0123586859488623 * x2
+                          - 0.0860505264736028 * scaled_y - 1.252783383752970;
+              } else if (scaled_y < -7.0) {  // cubic correction
+                dnlcdf += 0.000471585349920831 * x2 * scaled_y
+                          + 0.0296839305424034 * x2
+                          + 0.207402143352332 * scaled_y + 0.425316974683324;
+              } else if (scaled_y < -3.9) {  // cubic correction
+                dnlcdf += -0.0006972280656443 * x2 * scaled_y
+                          + 0.0068218494628567 * x2
+                          + 0.0585761964460277 * scaled_y + 0.1034397670201370;
+              } else if (scaled_y < -2.1) {  // cubic correction
+                dnlcdf += -0.0018742199480885 * x2 * scaled_y
+                          - 0.0097119598291202 * x2
+                          - 0.0170137970924080 * scaled_y - 0.0100428567412041;
+              }
+            } else {
+              dnlcdf = INFINITY;  // also reached for a NaN input
+            }
+
+            return dnlcdf;
+          }) "\n#endif\n";  // NOLINT
+// \endcond
+
+}  // namespace opencl_kernels
+}  // namespace math
+}  // namespace stan
+
+#endif
+#endif
@@ -8,6 +8,7 @@
 #include <stan/math/prim/fun/elt_divide.hpp>
 #include <stan/math/prim/fun/elt_multiply.hpp>
 #include <stan/math/opencl/kernel_generator.hpp>
+#include <stan/math/opencl/prim/std_normal_lcdf.hpp>
 #include <stan/math/prim/functor/partials_propagator.hpp>
 
 namespace stan {
@@ -80,30 +81,76 @@ exp_mod_normal_lcdf(const T_y_cl& y, const T_loc_cl& mu,
   auto scaled_diff = elt_multiply(diff * INV_SQRT_TWO, sigma_inv);
   auto v = elt_multiply(lambda_val, sigma_val);
   auto scaled_diff_diff = scaled_diff - v * INV_SQRT_TWO;
-  auto erf_calc = 0.5 * (1.0 + erf(scaled_diff_diff));
-  auto exp_term = exp(0.5 * square(v) - elt_multiply(lambda_val, diff));
-  auto cdf_n = 0.5 + 0.5 * erf(scaled_diff) - elt_multiply(exp_term, erf_calc);
-  auto cdf_log_expr = colwise_sum(log(cdf_n));
+  auto cdf_term_1 = 0.5 + 0.5 * erf(scaled_diff);
+  auto cdf_term_2_phi = 0.5 * (1.0 + erf(scaled_diff_diff));
+  auto log_exp_term = 0.5 * square(v) - elt_multiply(lambda_val, diff);
+  auto exp_term = exp(log_exp_term);
+  auto cdf_term_2 = elt_multiply(exp_term, cdf_term_2_phi);
+  auto cdf_n = cdf_term_1 - cdf_term_2;
+  auto use_stable = cdf_n <= 0.0 || !isfinite(cdf_n);
 
   auto exp_term_2 = exp(-square(scaled_diff_diff));
-  auto deriv_1 = elt_multiply(elt_multiply(lambda_val, exp_term), erf_calc);
+  auto deriv_1 = elt_multiply(elt_multiply(lambda_val, exp_term), cdf_term_2_phi);
   auto deriv_2 = INV_SQRT_TWO_PI
                  * elt_multiply(elt_multiply(exp_term, exp_term_2), sigma_inv);
   auto deriv_3
       = INV_SQRT_TWO_PI * elt_multiply(exp(-square(scaled_diff)), sigma_inv);
-  auto y_deriv = elt_divide(deriv_1 - deriv_2 + deriv_3, cdf_n);
-  auto mu_deriv = -y_deriv;
-  auto sigma_deriv = -elt_divide(
+  auto direct_cdf_log = log(cdf_n);
+  auto direct_y_deriv = elt_divide(deriv_1 - deriv_2 + deriv_3, cdf_n);
+  auto direct_mu_deriv = -direct_y_deriv;
+  auto direct_sigma_deriv = -elt_divide(
       elt_multiply(deriv_1 - deriv_2, v)
           + elt_multiply(deriv_3 - deriv_2, scaled_diff) * SQRT_TWO,
       cdf_n);
-  auto lambda_deriv = elt_divide(
+  auto direct_lambda_deriv = elt_divide(
       elt_multiply(
           exp_term,
           INV_SQRT_TWO_PI * elt_multiply(sigma_val, exp_term_2)
-              - elt_multiply(elt_multiply(v, sigma_val) - diff, erf_calc)),
+              - elt_multiply(elt_multiply(v, sigma_val) - diff, cdf_term_2_phi)),
       cdf_n);
 
+  auto log_cdf_term_1 = std_normal_lcdf_scaled_impl(scaled_diff);
+  auto dlog_cdf_term_1 = std_normal_lcdf_dscaled_impl(scaled_diff);
+  auto log_cdf_term_2_phi = std_normal_lcdf_scaled_impl(scaled_diff_diff);
+  auto dlog_cdf_term_2_phi = std_normal_lcdf_dscaled_impl(scaled_diff_diff);
+  auto log_cdf_term_2 = log_exp_term + log_cdf_term_2_phi;
+  auto log_cdf_n = log_diff_exp(log_cdf_term_1, log_cdf_term_2);
+  auto cdf_term_1_weight = exp(log_cdf_term_1 - log_cdf_n);
+  auto cdf_term_2_weight = exp(log_cdf_term_2 - log_cdf_n);
+  auto scaled_diff_deriv
+      = elt_multiply(dlog_cdf_term_1, sigma_inv * INV_SQRT_TWO);
+  auto scaled_diff_diff_deriv
+      = elt_multiply(dlog_cdf_term_2_phi, sigma_inv * INV_SQRT_TWO);
+  auto stable_y_deriv = elt_multiply(cdf_term_1_weight, scaled_diff_deriv)
+                        - elt_multiply(cdf_term_2_weight,
+                                       -lambda_val + scaled_diff_diff_deriv);
+  auto stable_mu_deriv = -stable_y_deriv;
+  auto stable_sigma_deriv = elt_multiply(
+                                cdf_term_1_weight,
+                                -elt_multiply(dlog_cdf_term_1,
+                                              elt_multiply(scaled_diff,
+                                                           sigma_inv)))
+                            - elt_multiply(
+                                cdf_term_2_weight,
+                                elt_multiply(lambda_val, v)
+                                    - elt_multiply(
+                                        dlog_cdf_term_2_phi,
+                                        elt_multiply(
+                                            scaled_diff + v * INV_SQRT_TWO,
+                                            sigma_inv)));
+  auto stable_lambda_deriv
+      = -elt_multiply(cdf_term_2_weight,
+                      elt_multiply(v, sigma_val) - diff
+                          - elt_multiply(dlog_cdf_term_2_phi,
+                                         sigma_val * INV_SQRT_TWO));
+  auto cdf_log_expr = colwise_sum(select(use_stable, log_cdf_n, direct_cdf_log));
+  auto y_deriv = select(use_stable, stable_y_deriv, direct_y_deriv);
+  auto mu_deriv = select(use_stable, stable_mu_deriv, direct_mu_deriv);
+  auto sigma_deriv
+      = select(use_stable, stable_sigma_deriv, direct_sigma_deriv);
+  auto lambda_deriv
+      = select(use_stable, stable_lambda_deriv, direct_lambda_deriv);
+
   matrix_cl<char> any_y_neg_inf_cl;
   matrix_cl<char> any_y_pos_inf_cl;
   matrix_cl<double> cdf_log_cl;
 
@@ -67,8 +67,7 @@ inline return_type_t<T_y_cl, T_loc_cl, T_scale_cl> gumbel_cdf(
   auto exp_m_scaled_diff = exp(-scaled_diff);
   auto cdf_n = exp(-exp_m_scaled_diff);
   auto cdf_expr = colwise_prod(cdf_n);
-  auto rep_deriv = elt_divide(exp(-scaled_diff - exp_m_scaled_diff),
-                              elt_multiply(beta_val, cdf_n));
+  auto rep_deriv = elt_divide(exp_m_scaled_diff, beta_val);
 
   matrix_cl<double> cdf_cl;
   matrix_cl<double> y_deriv_cl;
Original file line number	Diff line number	Diff line change
`@@ -95,7 +95,7 @@ static constexpr const char* binomial_coefficient_log_device_function`
`95`	`95`	`} else if (n_plus_1 < LGAMMA_STIRLING_DIFF_USEFUL) {`
`96`	`96`	`return lgamma(n_plus_1) - lgamma(k + 1) - lgamma(n_plus_1_mk);`
`97`	`97`	`} else {`
`98`		`- return -lbeta(n_plus_1_mk, k + 1) - log1p(n);`
	`98`	`+ return -stan_lbeta(n_plus_1_mk, k + 1) - log1p(n);`
`99`	`99`	`}`
`100`	`100`	`}`
`101`	`101`	`// \cond`