Skip to content

Commit 66bf53e

Browse files
author
ssjia
committed
Update on "[ET-VK][ez] Implement helper functions to get fastest moving dim"
Add C++ and GLSL helpers to query the fastest moving dimension (the dimension with stride 1 in buffer layout). This is useful for optimizing memory access patterns in shaders, as iterating along the fastest moving dimension maximizes cache locality. The C++ `fastest_whcn_dim()` method accounts for block-transposed layouts by returning `outer_packed_dim` instead of `packed_dim` when applicable. A corresponding GLSL macro extracts this info from the hashed layout. Differential Revision: [D92061369](https://our.internmc.facebook.com/intern/diff/D92061369/) [ghstack-poisoned]
2 parents ac504c8 + 6755484 commit 66bf53e

80 files changed

Lines changed: 1462 additions & 464 deletions

File tree

Some content is hidden

Large commits have some content hidden by default. Use the search box below to find content that may be hidden.

.ci/scripts/export_model_artifact.sh

Lines changed: 3 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -157,10 +157,11 @@ pip list
157157
if [ "$MODEL_NAME" = "parakeet" ]; then
158158
pip install -r examples/models/parakeet/install_requirements.txt
159159

160-
python examples/models/parakeet/export_parakeet_tdt.py \
160+
python -m executorch.examples.models.parakeet.export_parakeet_tdt \
161161
--backend "$DEVICE" \
162162
--output-dir "${OUTPUT_DIR}" \
163-
--dtype bf16
163+
--dtype bf16 \
164+
${EXTRA_ARGS}
164165

165166
test -f "${OUTPUT_DIR}/model.pte"
166167
# CUDA saves named data to separate .ptd file, Metal embeds in .pte

.github/workflows/cuda.yml

Lines changed: 0 additions & 18 deletions
Original file line number | Diff line number | Diff line change
@@ -150,15 +150,6 @@ jobs:
150150
repo: "google"
151151
name: "gemma-3-4b-it"
152152
quant: "quantized-int4-weight-only"
153-
# Parakeet only supports non-quantized
154-
- model:
155-
repo: "nvidia"
156-
name: "parakeet-tdt"
157-
quant: "quantized-int4-tile-packed"
158-
- model:
159-
repo: "nvidia"
160-
name: "parakeet-tdt"
161-
quant: "quantized-int4-weight-only"
162153
with:
163154
timeout: 90
164155
secrets-env: EXECUTORCH_HF_TOKEN
@@ -219,15 +210,6 @@ jobs:
219210
repo: "google"
220211
name: "gemma-3-4b-it"
221212
quant: "quantized-int4-weight-only"
222-
# Parakeet only supports non-quantized
223-
- model:
224-
repo: "nvidia"
225-
name: "parakeet-tdt"
226-
quant: "quantized-int4-tile-packed"
227-
- model:
228-
repo: "nvidia"
229-
name: "parakeet-tdt"
230-
quant: "quantized-int4-weight-only"
231213
with:
232214
timeout: 90
233215
runner: linux.g5.4xlarge.nvidia.gpu

.github/workflows/trunk.yml

Lines changed: 0 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -315,7 +315,6 @@ jobs:
315315
- test_arm_baremetal: test_run_ethos_u85
316316
- test_arm_baremetal: test_smaller_stories_llama
317317
- test_arm_baremetal: test_memory_allocation
318-
- test_arm_baremetal: test_model_smollm2-135M
319318
fail-fast: false
320319
with:
321320
runner: linux.2xlarge.memory

.lintrunner.toml

Lines changed: 14 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -503,3 +503,17 @@ command = [
503503
'--',
504504
'@{{PATHSFILE}}',
505505
]
506+
507+
[[linter]]
508+
code = 'DOCFORMATTER'
509+
include_patterns = []
510+
exclude_patterns = ['**']
511+
command = [
512+
'python','-m','lintrunner_adapters','run','docformatter_linter','--config=pyproject.toml','--','@{{PATHSFILE}}'
513+
]
514+
init_command = [
515+
'python','-m','lintrunner_adapters','run','pip_init',
516+
'--dry-run={{DRYRUN}}',
517+
'--requirement=requirements-lintrunner.txt',
518+
]
519+
is_formatter = true

backends/aoti/slim/c10/core/ScalarType.h

Lines changed: 18 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -133,6 +133,24 @@ inline bool isBoolType(ScalarType t) {
133133
return t == ScalarType::Bool;
134134
}
135135

136+
/// Checks if the scalar type is a valid/supported type.
137+
/// @param t The scalar type to check.
138+
/// @return true if the scalar type is valid, false otherwise.
139+
inline bool isValidScalarType(ScalarType t) {
140+
switch (t) {
141+
case ScalarType::Char:
142+
case ScalarType::Short:
143+
case ScalarType::Int:
144+
case ScalarType::Long:
145+
case ScalarType::Float:
146+
case ScalarType::Bool:
147+
case ScalarType::BFloat16:
148+
return true;
149+
default:
150+
return false;
151+
}
152+
}
153+
136154
inline std::ostream& operator<<(std::ostream& stream, ScalarType scalar_type) {
137155
return stream << toString(scalar_type);
138156
}

backends/aoti/slim/core/slim_tensor.h

Lines changed: 9 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -52,6 +52,7 @@ class SlimTensor {
5252
storage_offset_(storage_offset),
5353
dtype_(dtype) {
5454
set_sizes_and_strides(sizes, strides);
55+
check_supportive();
5556
}
5657

5758
/**
@@ -65,6 +66,7 @@ class SlimTensor {
6566
is_contiguous_(true) {
6667
sizes_and_strides_.set_sizes({0});
6768
sizes_and_strides_.set_strides({1});
69+
check_supportive();
6870
}
6971

7072
// Default copy/move operations
@@ -556,6 +558,13 @@ class SlimTensor {
556558
static_cast<int64_t>(numel_));
557559
}
558560

561+
/// Validates this tensor's dtype against c10::isValidScalarType().
///
/// When the dtype is rejected, ET_CHECK_MSG fires with a message that
/// includes the dtype's integer value.
void check_supportive() const {
  // The condition is spelled exactly as before in case the macro embeds the
  // expression text in its output.
  ET_CHECK_MSG(
      c10::isValidScalarType(dtype_),
      "invalid dtype %d", static_cast<int>(dtype_));
}
567+
559568
Storage storage_;
560569
int64_t storage_offset_{0};
561570
c10::SizesAndStrides sizes_and_strides_;

backends/aoti/slim/core/storage.h

Lines changed: 5 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -240,7 +240,11 @@ class MaybeOwningStorage {
240240
data_(data),
241241
capacity_(nbytes),
242242
deleter_(detail::noop),
243-
is_owning_(false) {}
243+
is_owning_(false) {
244+
if (!device.is_cuda() && !device.is_cpu()) {
245+
ET_CHECK_MSG(false, "Unsupported device type: %s", device.str().c_str());
246+
}
247+
}
244248

245249
/// Default constructor is deleted - storage must have a device.
246250
MaybeOwningStorage() = delete;

backends/aoti/slim/core/test/test_slimtensor_basic.cpp

Lines changed: 34 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -489,4 +489,38 @@ TEST(SlimTensorBasicTest, DataPtrWithOffset) {
489489
EXPECT_EQ(data, static_cast<char*>(base) + 5 * sizeof(float));
490490
}
491491

492+
// =============================================================================
493+
// Dtype and Device Type Validation Tests
494+
// =============================================================================
495+
496+
// Constructing a SlimTensor with ScalarType::Undefined is expected to
// terminate the process.
TEST(SlimTensorValidationTest, InvalidDtypeUndefined) {
  std::vector<int64_t> dims = {2, 3};
  std::vector<int64_t> steps = {3, 1};
  // Room for the 2x3 float elements described by dims.
  Storage backing = make_cpu_storage(6 * sizeof(float));
  const c10::ScalarType bad_dtype = c10::ScalarType::Undefined;

  EXPECT_DEATH(
      SlimTensor(
          std::move(backing),
          makeArrayRef(dims),
          makeArrayRef(steps),
          bad_dtype),
      "");
}
510+
511+
// Constructing a SlimTensor with the raw dtype value 7 is expected to
// terminate the process.
TEST(SlimTensorValidationTest, InvalidDtypeDouble) {
  std::vector<int64_t> dims = {2, 3};
  std::vector<int64_t> steps = {3, 1};
  // Room for the 2x3 double elements described by dims.
  Storage backing = make_cpu_storage(6 * sizeof(double));
  const auto double_dtype = static_cast<c10::ScalarType>(7); // Double = 7

  EXPECT_DEATH(
      SlimTensor(
          std::move(backing),
          makeArrayRef(dims),
          makeArrayRef(steps),
          double_dtype),
      "");
}
525+
492526
} // namespace executorch::backends::aoti::slim

backends/aoti/slim/factory/test/test_empty.cpp

Lines changed: 59 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -233,6 +233,65 @@ TEST(EmptyTest, CanWriteAndReadData) {
233233
}
234234
}
235235

236+
// =============================================================================
237+
// Dtype and Device Type Validation Tests
238+
// =============================================================================
239+
240+
// empty_strided() called with ScalarType::Undefined is expected to terminate
// the process.
TEST(EmptyStridedTest, InvalidDtypeUndefined) {
  std::vector<int64_t> dims = {2, 3};
  std::vector<int64_t> steps = {3, 1};
  const c10::ScalarType bad_dtype = c10::ScalarType::Undefined;

  EXPECT_DEATH(
      empty_strided(makeArrayRef(dims), makeArrayRef(steps), bad_dtype), "");
}
251+
252+
// empty_strided() called with the raw dtype value 7 is expected to terminate
// the process.
TEST(EmptyStridedTest, InvalidDtypeDouble) {
  std::vector<int64_t> dims = {2, 3};
  std::vector<int64_t> steps = {3, 1};
  const auto double_dtype = static_cast<c10::ScalarType>(7); // Double = 7

  EXPECT_DEATH(
      empty_strided(makeArrayRef(dims), makeArrayRef(steps), double_dtype),
      "");
}
263+
264+
// empty_strided() called with a device whose type is the raw value 100 is
// expected to terminate the process.
TEST(EmptyStridedTest, InvalidDeviceType) {
  std::vector<int64_t> dims = {2, 3};
  std::vector<int64_t> steps = {3, 1};

  // NOTE(review): 100 is presumably outside the defined DeviceType values;
  // confirm against the enum.
  const auto bogus_type = static_cast<c10::DeviceType>(100);
  c10::Device bogus_device(bogus_type, 0);

  EXPECT_DEATH(
      empty_strided(
          makeArrayRef(dims),
          makeArrayRef(steps),
          c10::ScalarType::Float,
          bogus_device),
      "");
}
278+
279+
// empty() called with ScalarType::Undefined is expected to terminate the
// process.
TEST(EmptyTest, InvalidDtypeUndefined) {
  const c10::ScalarType bad_dtype = c10::ScalarType::Undefined;

  EXPECT_DEATH(empty({2, 3}, bad_dtype), "");
}
282+
283+
// empty() called with the raw dtype value 7 is expected to terminate the
// process.
TEST(EmptyTest, InvalidDtypeDouble) {
  const auto double_dtype = static_cast<c10::ScalarType>(7); // Double = 7

  EXPECT_DEATH(empty({2, 3}, double_dtype), "");
}
288+
289+
// empty() called with a device whose type is the raw value 100 is expected to
// terminate the process.
TEST(EmptyTest, InvalidDeviceType) {
  // NOTE(review): 100 is presumably outside the defined DeviceType values;
  // confirm against the enum.
  const auto bogus_type = static_cast<c10::DeviceType>(100);
  c10::Device bogus_device(bogus_type, 0);

  EXPECT_DEATH(empty({2, 3}, c10::ScalarType::Float, bogus_device), "");
}
294+
236295
#ifdef CUDA_AVAILABLE
237296

238297
// =============================================================================

backends/aoti/slim/factory/test/test_from_blob.cpp

Lines changed: 31 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -315,6 +315,37 @@ TEST(FromBlobTest, WithArrayRef) {
315315
EXPECT_TRUE(tensor.is_contiguous());
316316
}
317317

318+
// =============================================================================
319+
// Dtype and Device Type Validation Tests
320+
// =============================================================================
321+
322+
// from_blob() called with ScalarType::Undefined is expected to terminate the
// process.
TEST(FromBlobTest, InvalidDtypeUndefined) {
  constexpr size_t kElementCount = 6;
  float blob[kElementCount];
  const c10::ScalarType bad_dtype = c10::ScalarType::Undefined;

  EXPECT_DEATH(from_blob(blob, {2, 3}, bad_dtype), "");
}
329+
330+
// from_blob() called with the raw dtype value 7 is expected to terminate the
// process.
TEST(FromBlobTest, InvalidDtypeDouble) {
  constexpr size_t kElementCount = 6;
  float blob[kElementCount];
  // NOTE(review): 7 is presumably Double, as in PyTorch's c10::ScalarType
  // numbering; confirm against the slim enum.
  const auto double_dtype = static_cast<c10::ScalarType>(7);

  EXPECT_DEATH(from_blob(blob, {2, 3}, double_dtype), "");
}
337+
338+
// from_blob() called with a device whose type is the raw value 100 is expected
// to terminate the process.
TEST(FromBlobTest, InvalidDeviceType) {
  constexpr size_t kElementCount = 6;
  float blob[kElementCount];

  // NOTE(review): 100 is presumably outside the defined DeviceType values;
  // confirm against the enum.
  const auto bogus_type = static_cast<c10::DeviceType>(100);
  c10::Device bogus_device(bogus_type, 0);

  EXPECT_DEATH(
      from_blob(blob, {2, 3}, c10::ScalarType::Float, bogus_device), "");
}
348+
318349
// =============================================================================
319350
// CUDA from_blob Tests
320351
// Tests are skipped at runtime if CUDA hardware is not available.

0 commit comments

Comments
 (0)