
Commit b448787

Merge branch 'main' into docathon/contributing-cpp-tests
2 parents 70bd86c + 93b764e commit b448787

24 files changed

Lines changed: 503 additions & 127 deletions


.ci/scripts/setup-macos.sh

Lines changed: 9 additions & 3 deletions
```diff
@@ -116,7 +116,6 @@ setup_macos_env_variables
 # buck2 atm
 install_buck
 brew install libomp
-install_pip_dependencies

 # TODO(huydhn): Unlike our self-hosted runner, GitHub runner doesn't have access
 # to our infra, so compiler caching needs to be setup differently using GitHub
@@ -125,10 +124,17 @@ if [[ -z "${GITHUB_RUNNER:-}" ]]; then
   install_sccache
 fi

+# Install pinned torch before requirements-ci.txt so torchsr's transitive
+# torch dep is satisfied by the existing install and pip does not pull a
+# separate copy from PyPI. sccache is initialized above so source-build
+# cache misses still hit the cache.
 print_cmake_info
 install_pytorch_and_domains
-# We build PyTorch from source here instead of using nightly. This allows CI to test against
-# the pinned commit from PyTorch
+
+install_pip_dependencies
+
+# install_executorch's --use-pt-pinned-commit skips re-installing torch since
+# install_pytorch_and_domains already installed the pinned build above.
 if [[ "$EDITABLE" == "true" ]]; then
   install_executorch --use-pt-pinned-commit --editable
 else
```
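The comments above encode an ordering invariant: the pinned torch must be installed before requirements-ci.txt is processed. A minimal sketch of how one could assert that invariant as a pre-flight check; this helper is hypothetical (not part of the commit), and `PINNED` is a stand-in for the real pin, which CI reads from its pin files:

```python
# Hypothetical pre-flight check, not part of this commit: confirm the pinned
# torch is already installed so pip treats torchsr's transitive torch
# requirement as satisfied instead of pulling a second copy from PyPI.
from importlib.metadata import PackageNotFoundError, version

PINNED = "2.9.0"  # hypothetical pin; CI reads the real pin elsewhere

try:
    installed = version("torch")
except PackageNotFoundError:
    raise SystemExit("torch missing: run install_pytorch_and_domains first")

if not installed.startswith(PINNED):
    raise SystemExit(f"torch {installed} != pin {PINNED}: pip may fetch a duplicate")
print(f"torch {installed} satisfies the pin; safe to install requirements-ci.txt")
```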

.ci/scripts/test_lora.sh

Lines changed: 2 additions & 1 deletion
```diff
@@ -159,7 +159,8 @@ Okay, so I need to calculate 15% of 80."
 EXPECTED_QUANT_LORA_PREFIX="
 <|im_start|>user Calculate 15% of 80?<|im_end|><|im_start|>assistant
 To calculate 15% of 80, we can multiply 80 by 15/100.
-So, 15% of 80 is equal to (80 * 15) / 100 = 1200 / 100 = 12.
+80 * 15/100 = 12.
+So, 15% of 80 is 12.
 #### 12
 The answer is: 12<|im_end|>"
 EXPECTED_QUANT_LORA_ALTERNATE_PREFIX="
```

.ci/scripts/utils.sh

Lines changed: 4 additions & 0 deletions
```diff
@@ -127,6 +127,10 @@ install_pytorch_and_domains() {
   if [[ "${torch_wheel_not_found}" == "1" ]]; then
     echo "No cached wheel found, continue with building PyTorch at ${TORCH_VERSION}"

+    # Install PyTorch's own build-time deps so the source build does not
+    # silently inherit them from whatever else happens to be in the env
+    # (e.g. executorch's requirements-ci.txt).
+    pip install -r requirements-build.txt
     git submodule update --init --recursive
     USE_DISTRIBUTED=1 python setup.py bdist_wheel
     pip install "$(echo dist/*.whl)"
```

backends/aoti/slim/cuda/test/targets.bzl

Lines changed: 3 additions & 2 deletions
```diff
@@ -1,8 +1,8 @@
-load("@fbcode_macros//build_defs:cpp_unittest.bzl", "cpp_unittest")
+load("@fbcode_macros//build_defs:gpu_cpp_unittest.bzl", "gpu_cpp_unittest")
 load("@fbcode_macros//build_defs/lib:re_test_utils.bzl", "re_test_utils")

 def cuda_slim_cpp_unittest(name):
-    cpp_unittest(
+    gpu_cpp_unittest(
         name = "test_" + name,
         srcs = [
             "test_" + name + ".cpp",
@@ -16,6 +16,7 @@ def cuda_slim_cpp_unittest(name):
         external_deps = [
             ("cuda", None, "cuda-lazy"),
         ],
+        hip_compatible = False,
         keep_gpu_sections = True,
         remote_execution = re_test_utils.remote_execution(
             platform = "gpu-remote-execution",
```

backends/arm/test/ops/test_sum.py

Lines changed: 70 additions & 2 deletions
```diff
@@ -5,6 +5,8 @@

 from typing import Callable, Tuple

+import pytest
+
 import torch
 from executorch.backends.arm.test import common

@@ -96,7 +98,16 @@ def test_sum_dim_intlist_tosa_INT(test_data: input_t1):
     pipeline.run()


-@common.parametrize("test_data", Sum.test_parameters)
+# dim=None cases skipped: executorch.devtools.bundled_program.config rejects
+# None as a model input (cannot be serialized into the bundled program).
+_DIM_NONE_SKIP_REASON = "bundled_program cannot serialize None as a model input"
+_dim_none_skips = {
+    "dim_None": _DIM_NONE_SKIP_REASON,
+    "dim_None_4d_tensor": _DIM_NONE_SKIP_REASON,
+}
+
+
+@common.parametrize("test_data", Sum.test_parameters, skips=_dim_none_skips)
 @common.XfailIfNoCorstone300
 def test_sum_u55_INT_1_0(test_data: Tuple):
     pipeline = EthosU55PipelineINT[input_t1](
@@ -108,7 +119,7 @@ def test_sum_u55_INT_1_0(test_data: Tuple):
     pipeline.run()


-@common.parametrize("test_data", Sum.test_parameters)
+@common.parametrize("test_data", Sum.test_parameters, skips=_dim_none_skips)
 @common.XfailIfNoCorstone320
 def test_sum_u85_INT_1_0(test_data: Tuple):
     pipeline = EthosU85PipelineINT[input_t1](
@@ -220,3 +231,60 @@ def test_sum_tosa_FP(test_data: Callable[[], input_t2]):
 def test_sum_tosa_INT(test_data: Callable[[], input_t2]):
     pipeline = TosaPipelineINT[input_t1](SumDefault(), test_data(), SumDefault.aten_op)
     pipeline.run()
+
+
+# a16w8 (int16 IO + int8 weights) coverage for sum.dim_IntList. Surfaces the
+# Ethos-U85 int16 ReduceSum silent-zero issue tracked upstream at
+# https://gitlab.arm.com/artificial-intelligence/ethos-u/ethos-u-vela/-/issues/23.
+
+
+class SumLastDim(torch.nn.Module):
+    """Reduce the last dim with keepdim=True."""
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        return x.sum(dim=-1, keepdim=True)
+
+
+a16w8_sum_test_parameters = {
+    "rank1_16": lambda: (torch.rand(16),),
+    "rank3_8x1x16": lambda: (torch.rand(8, 1, 16),),
+    "rank3_4x4x16": lambda: (torch.rand(4, 4, 16),),
+}
+
+
+@common.parametrize("test_data", a16w8_sum_test_parameters)
+@common.XfailIfNoCorstone300
+def test_sum_dim_intlist_a16w8_u55_INT(test_data: Callable[[], input_t1]):
+    pipeline = EthosU55PipelineINT[input_t1](
+        SumLastDim(),
+        test_data(),
+        aten_op,
+        exir_ops=[],
+        a16w8_quantization=True,
+        symmetric_io_quantization=True,
+        qtol=128,
+        epsilon=2**-16,
+    )
+    pipeline.run()
+
+
+# All cases hit upstream Vela issue #23 (linked above). strict=False so the
+# test target stays green both on stock Vela 5.0 (cases XFAIL) and once the
+# Vela fix is in tree (cases XPASS).
+@common.parametrize("test_data", a16w8_sum_test_parameters)
+@common.XfailIfNoCorstone320
+@pytest.mark.xfail(
+    reason="Ethos-U85 int16 ReduceSum returns zero (vela#23)", strict=False
+)
+def test_sum_dim_intlist_a16w8_u85_INT(test_data: Callable[[], input_t1]):
+    pipeline = EthosU85PipelineINT[input_t1](
+        SumLastDim(),
+        test_data(),
+        aten_op,
+        exir_ops=[],
+        a16w8_quantization=True,
+        symmetric_io_quantization=True,
+        qtol=128,
+        epsilon=2**-16,
+    )
+    pipeline.run()
```
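Two pytest mechanisms do the gating in this file: per-case skips keyed by parametrization id, and a non-strict xfail. Here is a pytest-only sketch of both patterns, using plain `pytest.param` in place of ExecuTorch's `common.parametrize` wrapper (its `skips=` plumbing is assumed to reduce to `pytest.mark.skip`; the diff does not show that implementation):

```python
import pytest

# Mirrors the _dim_none_skips dict above: case ids mapped to skip reasons.
_skips = {"dim_None": "bundled_program cannot serialize None as a model input"}

def _cases(data):
    return [
        pytest.param(
            value,
            id=case_id,
            marks=pytest.mark.skip(reason=_skips[case_id]) if case_id in _skips else (),
        )
        for case_id, value in data.items()
    ]

@pytest.mark.parametrize("x", _cases({"dim_None": None, "dim_0": 0}))
@pytest.mark.xfail(reason="stand-in for a known backend bug", strict=False)
def test_sketch(x):
    # strict=False: a failing case reports XFAIL and a passing case reports
    # XPASS; neither fails the suite, so the target stays green both before
    # and after the upstream fix lands.
    assert x == 0
```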

backends/arm/test/targets.bzl

Lines changed: 2 additions & 1 deletion
```diff
@@ -3,7 +3,7 @@ load("@fbcode_macros//build_defs:python_pytest.bzl", "python_pytest")
 load("@bazel_skylib//lib:paths.bzl", "paths")
 load("@fbsource//xplat/executorch/build:runtime_wrapper.bzl", "runtime")

-_ENABLE_VGF = False
+_ENABLE_VGF = True

 def define_arm_tests():
     # TODO [fbonly] Add more tests
@@ -30,6 +30,7 @@ def define_arm_tests():
         "ops/test_slice.py",
         "ops/test_sigmoid.py",
         "ops/test_sub.py",
+        "ops/test_sum.py",
         "ops/test_tanh.py",
         "ops/test_view.py",
         "ops/test_cos.py",
```

backends/cuda/tests/test_fused_moe.py

Lines changed: 5 additions & 0 deletions
```diff
@@ -503,6 +503,11 @@ class TestFusedMoEBatchedInt8(unittest.TestCase):
         (55, 64, 64, 32, 4, 2, 32, "64tok"),
         (99, 128, 128, 64, 8, 2, 32, "128tok"),
         (0, 256, 128, 64, 8, 2, 32, "256tok"),
+        # Realistic-scale configs to catch precision/alignment issues with
+        # K > PREQUANT_BLOCK_K (matches Qwen3.5-MoE shapes: hidden=2048,
+        # intermediate=1024, num_experts=8, top_k=2, group_size=128).
+        (77, 512, 2048, 1024, 8, 2, 128, "512tok_real_dims"),
+        (21, 1, 2048, 1024, 8, 2, 128, "1tok_decode"),
     ]

     def test_int8_correctness(self):
```
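Why the new shapes matter: with group_size=128, K=2048 spans 16 quantization groups along the reduction dimension, whereas the earlier small configs span only a handful, so per-group dequantization error had little room to accumulate. An illustrative check; the tuple field order (seed, tokens, hidden, intermediate, experts, top_k, group_size, label) is inferred from the case names and the comment, not stated in the diff:

```python
# Illustrative arithmetic only; field order is an inference, see lead-in.
cases = [
    (0, 256, 128, 64, 8, 2, 32, "256tok"),
    (77, 512, 2048, 1024, 8, 2, 128, "512tok_real_dims"),
    (21, 1, 2048, 1024, 8, 2, 128, "1tok_decode"),
]
for _seed, tokens, hidden, _inter, _experts, _top_k, group, label in cases:
    # More groups along K means per-group dequant error can accumulate,
    # which the small legacy shapes could not expose.
    print(f"{label}: {tokens} token(s), K={hidden} -> {hidden // group} group(s) of {group}")
```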

backends/cuda/triton/kernels/__init__.py

Lines changed: 2 additions & 0 deletions
```diff
@@ -8,6 +8,7 @@
     fused_moe,
     fused_moe_batched,
     fused_moe_batched_gemm,
+    fused_moe_batched_gemm_int8,
     moe_align_block_size,
 )

@@ -23,6 +24,7 @@
     "fused_moe",
     "fused_moe_batched",
     "fused_moe_batched_gemm",
+    "fused_moe_batched_gemm_int8",
     "int4_matvec",
     "moe_align_block_size",
     "sdpa",
```
