Commit 5deccf8

pssrawat authored and facebook-github-bot committed
Add portable var_mean.correction_out kernel (#18775)
Summary: Add a new portable ExecuTorch kernel for var_mean.correction_out that computes both variance and mean in a single function call. It uses the same two-pass algorithm as var.correction_out; the mean computed in pass 1 is written to the mean output tensor instead of being discarded.

Reviewed By: manuelcandales

Differential Revision: D100016876
1 parent 2d13fae commit 5deccf8
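As a quick illustration of that two-pass scheme, the following plain-C++ sketch computes variance and mean of a flat buffer with the same correction handling the kernel uses (denominator = N - correction, NaN outputs when the denominator is not positive). It is a simplified illustration, not the ExecuTorch kernel itself; the function name and types are illustrative.

// Minimal sketch of the two-pass var/mean scheme over a flat array.
// Illustrative only; the real kernel reduces over an arbitrary dim_list
// and dispatches over ExecuTorch dtypes.
#include <cmath>
#include <cstddef>
#include <utility>
#include <vector>

std::pair<double, double> var_mean_two_pass(
    const std::vector<float>& x, double correction = 1.0) {
  const size_t n = x.size();
  const double denom = static_cast<double>(n) - correction;
  if (n == 0 || denom <= 0) {
    // Mirrors the kernel's degenerate case: both outputs become NaN.
    return {NAN, NAN};
  }
  // Pass 1: sum -> mean.
  double sum = 0.0;
  for (float v : x) {
    sum += v;
  }
  const double mean = sum / static_cast<double>(n);
  // Pass 2: sum of squared deviations -> variance.
  double sum2 = 0.0;
  for (float v : x) {
    const double d = static_cast<double>(v) - mean;
    sum2 += d * d;
  }
  return {sum2 / denom, mean};
}

With the default correction of 1, an input of {1, 2, 3, 4} gives mean 2.5 and variance 5/3 ≈ 1.667, matching torch.var_mean's unbiased default.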

7 files changed

Lines changed: 848 additions & 0 deletions

kernels/aten/functions.yaml

Lines changed: 2 additions & 0 deletions
@@ -433,6 +433,8 @@
 
 - op: var.correction_out
 
+- op: var_mean.correction_out
+
 - op: var.out
 
 - op: view_as_real_copy.out
Lines changed: 125 additions & 0 deletions
@@ -0,0 +1,125 @@
/*
 * Copyright (c) Meta Platforms, Inc. and affiliates.
 * All rights reserved.
 *
 * This source code is licensed under the BSD-style license found in the
 * LICENSE file in the root directory of this source tree.
 */

#include <c10/util/irange.h>
#include <cmath>

#include <executorch/kernels/portable/cpu/scalar_utils.h>
#include <executorch/kernels/portable/cpu/util/reduce_util.h>
#include <executorch/runtime/kernel/kernel_includes.h>

namespace torch {
namespace executor {
namespace native {
namespace {

template <typename CTYPE_IN, typename CTYPE_OUT>
void compute_var_mean(
    KernelRuntimeContext& ctx,
    const Tensor& in,
    Tensor& var_out,
    Tensor& mean_out,
    optional<ArrayRef<int64_t>> dim_list,
    const size_t num,
    const double denominator) {
  CTYPE_OUT* var_data = var_out.mutable_data_ptr<CTYPE_OUT>();
  CTYPE_OUT* mean_data = mean_out.mutable_data_ptr<CTYPE_OUT>();
  if (num == 0 || denominator <= 0) {
    for (const auto out_ix : c10::irange(var_out.numel())) {
      var_data[out_ix] = NAN;
      mean_data[out_ix] = NAN;
    }
  } else if (in.numel() > 0) {
    MapReduceOverDimListPlan plan(in, dim_list);
    const bool success = parallel_for_each_reduce_over_dim_list_output_index(
        in, dim_list, var_out, [&](const auto begin, const auto end) {
          for (const auto out_ix : c10::irange(begin, end)) {
            // Pass 1: compute sum -> mean
            CTYPE_OUT sum = plan.execute<CTYPE_IN, CTYPE_OUT>(
                [](CTYPE_IN v) { return static_cast<CTYPE_OUT>(v); },
                [](CTYPE_OUT outv, CTYPE_OUT acc) { return acc + outv; },
                out_ix);
            CTYPE_OUT mean = sum / static_cast<CTYPE_OUT>(num);
            mean_data[out_ix] = mean;
            // Pass 2: compute sum of squared deviations
            CTYPE_OUT sum2 = plan.execute<CTYPE_IN, CTYPE_OUT>(
                [mean](CTYPE_IN v) {
                  return (
                      (static_cast<CTYPE_OUT>(v) - mean) *
                      (static_cast<CTYPE_OUT>(v) - mean));
                },
                [](CTYPE_OUT outv, CTYPE_OUT acc) { return acc + outv; },
                out_ix);
            var_data[out_ix] = sum2 / denominator;
          }
        });
    ET_KERNEL_CHECK_MSG(ctx, success, Internal, , "parallel_for failed");
  }
}

} // namespace

std::tuple<Tensor&, Tensor&> var_mean_correction_out(
    KernelRuntimeContext& ctx,
    const Tensor& in,
    optional<ArrayRef<int64_t>> dim_list,
    const optional<Scalar>& correction,
    bool keepdim,
    Tensor& out0,
    Tensor& out1) {
  (void)ctx;

  std::tuple<Tensor&, Tensor&> ret_val(out0, out1);

  ET_KERNEL_CHECK(
      ctx,
      check_reduction_args(in, dim_list, keepdim, {}, out0),
      InvalidArgument,
      ret_val);

  ET_KERNEL_CHECK(
      ctx,
      check_reduction_args(in, dim_list, keepdim, {}, out1),
      InvalidArgument,
      ret_val);

  ET_KERNEL_CHECK(
      ctx,
      resize_reduction_out(in, dim_list, keepdim, out0) == Error::Ok,
      InvalidArgument,
      ret_val);

  ET_KERNEL_CHECK(
      ctx,
      resize_reduction_out(in, dim_list, keepdim, out1) == Error::Ok,
      InvalidArgument,
      ret_val);

  static constexpr auto name = "var_mean.correction_out";

  double correction_val = 1;
  if (correction.has_value()) {
    correction_val = utils::scalar_to<double>(correction.value());
  }

  const size_t num = get_reduced_dim_product(in, dim_list);
  const double denom = num - correction_val;

  ET_SWITCH_FLOATHBF16_TYPES(in.scalar_type(), ctx, name, CTYPE_IN, [&] {
    ET_SWITCH_FLOATHBF16_TYPES(out0.scalar_type(), ctx, name, CTYPE_OUT, [&] {
      compute_var_mean<CTYPE_IN, CTYPE_OUT>(
          ctx, in, out0, out1, dim_list, num, denom);
    });
  });

  return ret_val;
}

} // namespace native
} // namespace executor
} // namespace torch
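As a rough mental model of the per-output-index loop above (one (variance, mean) pair for each position that survives the reduction), here is a small standalone sketch that reduces over the last dimension of a row-major 2-D buffer. It deliberately sidesteps the MapReduceOverDimListPlan and parallel_for utilities used by the real kernel; the function name, shapes, and layout assumptions are illustrative only.

// Illustrative sketch: var/mean reduced over the last dim of a
// row-major [rows, cols] buffer, producing one output element per row.
// Mirrors the structure of the kernel's loop over out_ix, not its
// actual reduction machinery.
#include <cmath>
#include <cstddef>
#include <vector>

void var_mean_over_last_dim(
    const std::vector<float>& in, // size == rows * cols
    size_t rows,
    size_t cols,
    double correction,
    std::vector<double>& var_out, // resized to rows
    std::vector<double>& mean_out) {
  var_out.assign(rows, 0.0);
  mean_out.assign(rows, 0.0);
  const double denom = static_cast<double>(cols) - correction;
  for (size_t r = 0; r < rows; ++r) {
    if (cols == 0 || denom <= 0) {
      // Degenerate reduction: both outputs become NaN, as in the kernel.
      var_out[r] = NAN;
      mean_out[r] = NAN;
      continue;
    }
    const float* row = in.data() + r * cols;
    // Pass 1: mean of this row.
    double sum = 0.0;
    for (size_t c = 0; c < cols; ++c) {
      sum += row[c];
    }
    const double mean = sum / static_cast<double>(cols);
    mean_out[r] = mean;
    // Pass 2: sum of squared deviations for this row.
    double sum2 = 0.0;
    for (size_t c = 0; c < cols; ++c) {
      const double d = row[c] - mean;
      sum2 += d * d;
    }
    var_out[r] = sum2 / denom;
  }
}

Each row plays the role of one reduced slice: pass 1 yields that row's mean, pass 2 its sum of squared deviations, and the same denominator rule (count minus correction, NaN when non-positive) applies per output element, just as in the kernel.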

kernels/portable/functions.yaml

Lines changed: 5 additions & 0 deletions
@@ -1015,6 +1015,11 @@
     - arg_meta: null
       kernel_name: torch::executor::var_correction_out
 
+- op: var_mean.correction_out
+  kernels:
+    - arg_meta: null
+      kernel_name: torch::executor::var_mean_correction_out
+
 - op: var.out
   kernels:
     - arg_meta: null

kernels/test/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
@@ -311,6 +311,7 @@ set(all_test_sources
   "op_upsample_bilinear2d_aa_test.cpp"
   "op_upsample_nearest2d_test.cpp"
   "op_var_test.cpp"
+  "op_var_mean_test.cpp"
   "op_view_as_real_copy_test.cpp"
   "op_view_copy_test.cpp"
   "op_where_test.cpp"
