
Commit b448787

Merge branch 'main' into docathon/contributing-cpp-tests
2 parents 70bd86c + 93b764e commit b448787

24 files changed

Lines changed: 503 additions & 127 deletions


.ci/scripts/setup-macos.sh

Lines changed: 9 additions & 3 deletions
```diff
@@ -116,7 +116,6 @@ setup_macos_env_variables
 # buck2 atm
 install_buck
 brew install libomp
-install_pip_dependencies

 # TODO(huydhn): Unlike our self-hosted runner, GitHub runner doesn't have access
 # to our infra, so compiler caching needs to be setup differently using GitHub
@@ -125,10 +124,17 @@ if [[ -z "${GITHUB_RUNNER:-}" ]]; then
   install_sccache
 fi

+# Install pinned torch before requirements-ci.txt so torchsr's transitive
+# torch dep is satisfied by the existing install and pip does not pull a
+# separate copy from PyPI. sccache is initialized above so source-build
+# cache misses still hit the cache.
 print_cmake_info
 install_pytorch_and_domains
-# We build PyTorch from source here instead of using nightly. This allows CI to test against
-# the pinned commit from PyTorch
+
+install_pip_dependencies
+
+# install_executorch's --use-pt-pinned-commit skips re-installing torch since
+# install_pytorch_and_domains already installed the pinned build above.
 if [[ "$EDITABLE" == "true" ]]; then
   install_executorch --use-pt-pinned-commit --editable
 else
```
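The comments above encode an ordering invariant: the pinned torch must be installed before requirements-ci.txt is processed. A minimal sketch of how one could assert that invariant as a pre-flight check; this helper is hypothetical (not part of the commit), and `PINNED` is a stand-in for the real pin, which CI reads from its pin files:

```python
# Hypothetical pre-flight check, not part of this commit: confirm the pinned
# torch is already installed so pip treats torchsr's transitive torch
# requirement as satisfied instead of pulling a second copy from PyPI.
from importlib.metadata import PackageNotFoundError, version

PINNED = "2.9.0"  # hypothetical pin; CI reads the real pin elsewhere

try:
    installed = version("torch")
except PackageNotFoundError:
    raise SystemExit("torch missing: run install_pytorch_and_domains first")

if not installed.startswith(PINNED):
    raise SystemExit(f"torch {installed} != pin {PINNED}: pip may fetch a duplicate")
print(f"torch {installed} satisfies the pin; safe to install requirements-ci.txt")
```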

.ci/scripts/test_lora.sh

Lines changed: 2 additions & 1 deletion
```diff
@@ -159,7 +159,8 @@ Okay, so I need to calculate 15% of 80."
 EXPECTED_QUANT_LORA_PREFIX="
 <|im_start|>user Calculate 15% of 80?<|im_end|><|im_start|>assistant
 To calculate 15% of 80, we can multiply 80 by 15/100.
-So, 15% of 80 is equal to (80 * 15) / 100 = 1200 / 100 = 12.
+80 * 15/100 = 12.
+So, 15% of 80 is 12.
 #### 12
 The answer is: 12<|im_end|>"
 EXPECTED_QUANT_LORA_ALTERNATE_PREFIX="
```

.ci/scripts/utils.sh

Lines changed: 4 additions & 0 deletions
```diff
@@ -127,6 +127,10 @@ install_pytorch_and_domains() {
   if [[ "${torch_wheel_not_found}" == "1" ]]; then
     echo "No cached wheel found, continue with building PyTorch at ${TORCH_VERSION}"

+    # Install PyTorch's own build-time deps so the source build does not
+    # silently inherit them from whatever else happens to be in the env
+    # (e.g. executorch's requirements-ci.txt).
+    pip install -r requirements-build.txt
     git submodule update --init --recursive
     USE_DISTRIBUTED=1 python setup.py bdist_wheel
     pip install "$(echo dist/*.whl)"
```

backends/aoti/slim/cuda/test/targets.bzl

Lines changed: 3 additions & 2 deletions
```diff
@@ -1,8 +1,8 @@
-load("@fbcode_macros//build_defs:cpp_unittest.bzl", "cpp_unittest")
+load("@fbcode_macros//build_defs:gpu_cpp_unittest.bzl", "gpu_cpp_unittest")
 load("@fbcode_macros//build_defs/lib:re_test_utils.bzl", "re_test_utils")

 def cuda_slim_cpp_unittest(name):
-    cpp_unittest(
+    gpu_cpp_unittest(
         name = "test_" + name,
         srcs = [
             "test_" + name + ".cpp",
@@ -16,6 +16,7 @@ def cuda_slim_cpp_unittest(name):
         external_deps = [
             ("cuda", None, "cuda-lazy"),
         ],
+        hip_compatible = False,
         keep_gpu_sections = True,
         remote_execution = re_test_utils.remote_execution(
             platform = "gpu-remote-execution",
```

backends/arm/test/ops/test_sum.py

Lines changed: 70 additions & 2 deletions
```diff
@@ -5,6 +5,8 @@

 from typing import Callable, Tuple

+import pytest
+
 import torch
 from executorch.backends.arm.test import common

@@ -96,7 +98,16 @@ def test_sum_dim_intlist_tosa_INT(test_data: input_t1):
     pipeline.run()


-@common.parametrize("test_data", Sum.test_parameters)
+# dim=None cases skipped: executorch.devtools.bundled_program.config rejects
+# None as a model input (cannot be serialized into the bundled program).
+_DIM_NONE_SKIP_REASON = "bundled_program cannot serialize None as a model input"
+_dim_none_skips = {
+    "dim_None": _DIM_NONE_SKIP_REASON,
+    "dim_None_4d_tensor": _DIM_NONE_SKIP_REASON,
+}
+
+
+@common.parametrize("test_data", Sum.test_parameters, skips=_dim_none_skips)
 @common.XfailIfNoCorstone300
 def test_sum_u55_INT_1_0(test_data: Tuple):
     pipeline = EthosU55PipelineINT[input_t1](
@@ -108,7 +119,7 @@ def test_sum_u55_INT_1_0(test_data: Tuple):
     pipeline.run()


-@common.parametrize("test_data", Sum.test_parameters)
+@common.parametrize("test_data", Sum.test_parameters, skips=_dim_none_skips)
 @common.XfailIfNoCorstone320
 def test_sum_u85_INT_1_0(test_data: Tuple):
     pipeline = EthosU85PipelineINT[input_t1](
@@ -220,3 +231,60 @@ def test_sum_tosa_FP(test_data: Callable[[], input_t2]):
 def test_sum_tosa_INT(test_data: Callable[[], input_t2]):
     pipeline = TosaPipelineINT[input_t1](SumDefault(), test_data(), SumDefault.aten_op)
     pipeline.run()
+
+
+# a16w8 (int16 IO + int8 weights) coverage for sum.dim_IntList. Surfaces the
+# Ethos-U85 int16 ReduceSum silent-zero issue tracked upstream at
+# https://gitlab.arm.com/artificial-intelligence/ethos-u/ethos-u-vela/-/issues/23.
+
+
+class SumLastDim(torch.nn.Module):
+    """Reduce the last dim with keepdim=True."""
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        return x.sum(dim=-1, keepdim=True)
+
+
+a16w8_sum_test_parameters = {
+    "rank1_16": lambda: (torch.rand(16),),
+    "rank3_8x1x16": lambda: (torch.rand(8, 1, 16),),
+    "rank3_4x4x16": lambda: (torch.rand(4, 4, 16),),
+}
+
+
+@common.parametrize("test_data", a16w8_sum_test_parameters)
+@common.XfailIfNoCorstone300
+def test_sum_dim_intlist_a16w8_u55_INT(test_data: Callable[[], input_t1]):
+    pipeline = EthosU55PipelineINT[input_t1](
+        SumLastDim(),
+        test_data(),
+        aten_op,
+        exir_ops=[],
+        a16w8_quantization=True,
+        symmetric_io_quantization=True,
+        qtol=128,
+        epsilon=2**-16,
+    )
+    pipeline.run()
+
+
+# All cases hit upstream Vela issue #23 (linked above). strict=False so the
+# test target stays green both on stock Vela 5.0 (cases XFAIL) and once the
+# Vela fix is in tree (cases XPASS).
+@common.parametrize("test_data", a16w8_sum_test_parameters)
+@common.XfailIfNoCorstone320
+@pytest.mark.xfail(
+    reason="Ethos-U85 int16 ReduceSum returns zero (vela#23)", strict=False
+)
+def test_sum_dim_intlist_a16w8_u85_INT(test_data: Callable[[], input_t1]):
+    pipeline = EthosU85PipelineINT[input_t1](
+        SumLastDim(),
+        test_data(),
+        aten_op,
+        exir_ops=[],
+        a16w8_quantization=True,
+        symmetric_io_quantization=True,
+        qtol=128,
+        epsilon=2**-16,
+    )
+    pipeline.run()
```
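Two pytest mechanisms do the gating in this file: per-case skips keyed by parametrization id, and a non-strict xfail. Here is a pytest-only sketch of both patterns, using plain `pytest.param` in place of ExecuTorch's `common.parametrize` wrapper (its `skips=` plumbing is assumed to reduce to `pytest.mark.skip`; the diff does not show that implementation):

```python
import pytest

# Mirrors the _dim_none_skips dict above: case ids mapped to skip reasons.
_skips = {"dim_None": "bundled_program cannot serialize None as a model input"}

def _cases(data):
    return [
        pytest.param(
            value,
            id=case_id,
            marks=pytest.mark.skip(reason=_skips[case_id]) if case_id in _skips else (),
        )
        for case_id, value in data.items()
    ]

@pytest.mark.parametrize("x", _cases({"dim_None": None, "dim_0": 0}))
@pytest.mark.xfail(reason="stand-in for a known backend bug", strict=False)
def test_sketch(x):
    # strict=False: a failing case reports XFAIL and a passing case reports
    # XPASS; neither fails the suite, so the target stays green both before
    # and after the upstream fix lands.
    assert x == 0
```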

backends/arm/test/targets.bzl

Lines changed: 2 additions & 1 deletion
```diff
@@ -3,7 +3,7 @@ load("@fbcode_macros//build_defs:python_pytest.bzl", "python_pytest")
 load("@bazel_skylib//lib:paths.bzl", "paths")
 load("@fbsource//xplat/executorch/build:runtime_wrapper.bzl", "runtime")

-_ENABLE_VGF = False
+_ENABLE_VGF = True

 def define_arm_tests():
     # TODO [fbonly] Add more tests
@@ -30,6 +30,7 @@ def define_arm_tests():
         "ops/test_slice.py",
         "ops/test_sigmoid.py",
         "ops/test_sub.py",
+        "ops/test_sum.py",
         "ops/test_tanh.py",
         "ops/test_view.py",
         "ops/test_cos.py",
```

backends/cuda/tests/test_fused_moe.py

Lines changed: 5 additions & 0 deletions
```diff
@@ -503,6 +503,11 @@ class TestFusedMoEBatchedInt8(unittest.TestCase):
         (55, 64, 64, 32, 4, 2, 32, "64tok"),
         (99, 128, 128, 64, 8, 2, 32, "128tok"),
         (0, 256, 128, 64, 8, 2, 32, "256tok"),
+        # Realistic-scale configs to catch precision/alignment issues with
+        # K > PREQUANT_BLOCK_K (matches Qwen3.5-MoE shapes: hidden=2048,
+        # intermediate=1024, num_experts=8, top_k=2, group_size=128).
+        (77, 512, 2048, 1024, 8, 2, 128, "512tok_real_dims"),
+        (21, 1, 2048, 1024, 8, 2, 128, "1tok_decode"),
     ]

     def test_int8_correctness(self):
```
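Why the new shapes matter: with group_size=128, K=2048 spans 16 quantization groups along the reduction dimension, whereas the earlier small configs span only a handful, so per-group dequantization error had little room to accumulate. An illustrative check; the tuple field order (seed, tokens, hidden, intermediate, experts, top_k, group_size, label) is inferred from the case names and the comment, not stated in the diff:

```python
# Illustrative arithmetic only; field order is an inference, see lead-in.
cases = [
    (0, 256, 128, 64, 8, 2, 32, "256tok"),
    (77, 512, 2048, 1024, 8, 2, 128, "512tok_real_dims"),
    (21, 1, 2048, 1024, 8, 2, 128, "1tok_decode"),
]
for _seed, tokens, hidden, _inter, _experts, _top_k, group, label in cases:
    # More groups along K means per-group dequant error can accumulate,
    # which the small legacy shapes could not expose.
    print(f"{label}: {tokens} token(s), K={hidden} -> {hidden // group} group(s) of {group}")
```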

backends/cuda/triton/kernels/__init__.py

Lines changed: 2 additions & 0 deletions
```diff
@@ -8,6 +8,7 @@
     fused_moe,
     fused_moe_batched,
     fused_moe_batched_gemm,
+    fused_moe_batched_gemm_int8,
     moe_align_block_size,
 )

@@ -23,6 +24,7 @@
     "fused_moe",
     "fused_moe_batched",
     "fused_moe_batched_gemm",
+    "fused_moe_batched_gemm_int8",
     "int4_matvec",
     "moe_align_block_size",
     "sdpa",
```
