Skip to content

Commit 7608f53

Browse files
committed
Update on "[Executorch][LLM] Use caching allocator for runner"
We observed that on iOS this improves performance by 6%, because the SDPA op performs temporary allocations. No significant difference was observed on Android. Differential Revision: [D86120038](https://our.internmc.facebook.com/intern/diff/D86120038/) [ghstack-poisoned]
2 parents d63ffbd + e22cb35 commit 7608f53

19 files changed

Lines changed: 756 additions & 113 deletions

File tree

backends/arm/operators/op_index_select.py

Lines changed: 4 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,7 @@
1919
)
2020
from executorch.backends.arm.tosa.mapping import TosaArg
2121

22-
from executorch.backends.arm.tosa.utils import build_reshape_tosa_1_0
22+
from executorch.backends.arm.tosa.utils import build_reshape_tosa
2323
from torch.fx import Node
2424

2525

@@ -67,7 +67,7 @@ def define_node(
6767
weights_new_shape,
6868
weights.dtype,
6969
)
70-
build_reshape_tosa_1_0(
70+
build_reshape_tosa(
7171
tosa_graph, weights.name, weights_new_shape, weights_reshaped.name
7272
)
7373

@@ -89,7 +89,7 @@ def define_node(
8989
indices_new_shape,
9090
indices.dtype,
9191
)
92-
build_reshape_tosa_1_0(
92+
build_reshape_tosa(
9393
tosa_graph, indices.name, indices_new_shape, indices_reshaped.name
9494
)
9595

@@ -106,6 +106,4 @@ def define_node(
106106

107107
if len(weights.shape) == 2:
108108
output_real_shape = [output.shape[0], output.shape[1]]
109-
build_reshape_tosa_1_0(
110-
tosa_graph, output_name, output_real_shape, output.name
111-
)
109+
build_reshape_tosa(tosa_graph, output_name, output_real_shape, output.name)

backends/arm/operators/op_index_tensor.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -180,7 +180,7 @@ def define_node(
180180
gather_idx_shape,
181181
index_dtype,
182182
)
183-
tutils.build_reshape_tosa_1_0(
183+
tutils.build_reshape_tosa(
184184
tosa_graph,
185185
stride_shifted_indices.name,
186186
gather_idx_shape,
@@ -212,7 +212,7 @@ def define_node(
212212
gather_vals_shape = [N, K, C]
213213
reshaped_input = tosa_graph.addIntermediate(gather_vals_shape, values.dtype)
214214

215-
tutils.build_reshape_tosa_1_0(
215+
tutils.build_reshape_tosa(
216216
tosa_graph,
217217
values.name,
218218
gather_vals_shape,
@@ -238,7 +238,7 @@ def define_node(
238238

239239
output_shape = tutils.tosa_shape(output.shape, output.dim_order)
240240

241-
tutils.build_reshape_tosa_1_0(
241+
tutils.build_reshape_tosa(
242242
tosa_graph,
243243
gather_out.name,
244244
list(output_shape),

backends/arm/test/ops/test_tanh.py

Lines changed: 9 additions & 51 deletions
Original file line numberDiff line numberDiff line change
@@ -8,21 +8,15 @@
88

99
import pytest
1010
import torch
11-
from executorch.backends.arm.quantizer.arm_quantizer import (
12-
get_symmetric_a16w8_quantization_config,
13-
TOSAQuantizer,
14-
)
1511

16-
from executorch.backends.arm.test import common, conftest
12+
from executorch.backends.arm.test import common
1713
from executorch.backends.arm.test.tester.test_pipeline import (
1814
EthosU55PipelineINT,
1915
EthosU85PipelineINT,
2016
TosaPipelineFP,
2117
TosaPipelineINT,
2218
VgfPipeline,
2319
)
24-
from executorch.backends.arm.tosa.specification import TosaSpecification
25-
from executorch.backends.xnnpack.test.tester import Quantize
2620

2721
aten_op = "torch.ops.aten.tanh.default"
2822
input_t1 = Tuple[torch.Tensor] # Input x
@@ -114,29 +108,6 @@ def test_tanh_vgf_INT(test_data: Tuple):
114108
pipeline.run()
115109

116110

117-
def get_symmetric_a16w8_tanh_quantizer(per_channel_quantization=False):
118-
tosa_version = conftest.get_option("tosa_version")
119-
tosa_profiles = {
120-
"1.0": TosaSpecification.create_from_string("TOSA-1.0+INT+int16"),
121-
}
122-
123-
quantizer = TOSAQuantizer(tosa_profiles[tosa_version])
124-
125-
# Use a smaller episilon value to not greatly inflate [qmin, qmax]
126-
quantizer.set_global(
127-
get_symmetric_a16w8_quantization_config(
128-
is_per_channel=per_channel_quantization, epsilon=2**-16
129-
)
130-
)
131-
132-
return Quantize(
133-
quantizer,
134-
get_symmetric_a16w8_quantization_config(
135-
is_per_channel=per_channel_quantization, epsilon=2**-16
136-
),
137-
)
138-
139-
140111
@common.parametrize("test_data", test_data_suite)
141112
def test_tanh_16a8w_tosa_INT(test_data: torch.Tensor):
142113
"""Test tanh operation with 16A8W quantization (16-bit activations, 8-bit weights)"""
@@ -150,13 +121,8 @@ def test_tanh_16a8w_tosa_INT(test_data: torch.Tensor):
150121
per_channel_quantization=per_channel_quantization,
151122
use_to_edge_transform_and_lower=True,
152123
tosa_extensions=["int16"],
153-
)
154-
155-
pipeline.change_args(
156-
"quantize",
157-
get_symmetric_a16w8_tanh_quantizer(
158-
per_channel_quantization=per_channel_quantization
159-
),
124+
epsilon=2**-16,
125+
rtol=2e-03,
160126
)
161127
pipeline.run()
162128

@@ -177,13 +143,9 @@ def test_tanh_16a8w_u55_INT16(test_data: torch.Tensor):
177143
exir_ops=[],
178144
per_channel_quantization=per_channel_quantization,
179145
use_to_edge_transform_and_lower=True,
180-
)
181-
182-
pipeline.change_args(
183-
"quantize",
184-
get_symmetric_a16w8_tanh_quantizer(
185-
per_channel_quantization=per_channel_quantization
186-
),
146+
a16w8_quantization=True,
147+
epsilon=2**-16,
148+
rtol=2e-03,
187149
)
188150
pipeline.run()
189151

@@ -201,12 +163,8 @@ def test_tanh_16a8w_u85_INT16(test_data: torch.Tensor):
201163
exir_ops=[],
202164
per_channel_quantization=per_channel_quantization,
203165
use_to_edge_transform_and_lower=True,
204-
)
205-
206-
pipeline.change_args(
207-
"quantize",
208-
get_symmetric_a16w8_tanh_quantizer(
209-
per_channel_quantization=per_channel_quantization
210-
),
166+
a16w8_quantization=True,
167+
epsilon=2**-16,
168+
rtol=2e-03,
211169
)
212170
pipeline.run()

backends/arm/test/tester/test_pipeline.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -376,7 +376,7 @@ def __init__(
376376
qtol: int = 1,
377377
dynamic_shapes: Optional[Tuple[Any]] = None,
378378
tosa_extensions: Optional[List[str]] = None,
379-
epsilon: float = 2**12,
379+
epsilon: float = 2**-12,
380380
):
381381
if tosa_extensions is None:
382382
tosa_extensions = []
@@ -570,7 +570,7 @@ def __init__(
570570
atol: float = 1e-03,
571571
rtol: float = 1e-03,
572572
qtol: int = 1,
573-
epsilon: float = 2**12,
573+
epsilon: float = 2**-12,
574574
):
575575
compile_spec = common.get_u55_compile_spec(
576576
custom_path=custom_path,
@@ -671,7 +671,7 @@ def __init__(
671671
atol: float = 1e-03,
672672
rtol: float = 1e-03,
673673
qtol: int = 1,
674-
epsilon: float = 2**12,
674+
epsilon: float = 2**-12,
675675
):
676676
compile_spec = common.get_u85_compile_spec(
677677
custom_path=custom_path,

backends/arm/tosa/utils.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -110,7 +110,7 @@ def broadcast_tensors(
110110
tens_dtype,
111111
)
112112

113-
build_reshape_tosa_1_0(tosa_fb, node.name, new_shape, reshaped.name)
113+
build_reshape_tosa(tosa_fb, node.name, new_shape, reshaped.name)
114114

115115
tiled = tosa_fb.addIntermediate(common_shape, tens_dtype)
116116
multipliers = [
@@ -137,7 +137,7 @@ def broadcast_tensors(
137137
return broadcast_tensors
138138

139139

140-
def build_reshape_tosa_1_0(
140+
def build_reshape_tosa(
141141
tosa_graph, input_name, new_shape, output_name, shape_name_override=""
142142
):
143143
"""Insert a TOSA reshape operator using the v1.0 semantics.

backends/vulkan/runtime/graph/ops/glsl/clone.glsl

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,10 @@ layout(std430) buffer;
1616

1717
${layout_declare_tensor(B, "w", "t_out", DTYPE, STORAGE)}
1818
${layout_declare_tensor(B, "r", "t_in", DTYPE, STORAGE)}
19-
${layout_declare_ubo(B, "ivec3", "out_limits")}
19+
20+
layout(push_constant) uniform restrict Block {
21+
ivec3 out_limits;
22+
};
2023

2124
layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in;
2225

backends/vulkan/runtime/graph/ops/impl/Clone.cpp

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -48,9 +48,9 @@ void add_clone_node(
4848
// Inputs and Outputs
4949
{{out, vkapi::kWrite}, {in, vkapi::kRead}},
5050
// Parameter Buffers
51-
{graph.logical_limits_ubo(out)},
52-
// Push Constants
5351
{},
52+
// Push Constants
53+
{graph.logical_limits_pc_of(out)},
5454
// Specialization Constants
5555
{},
5656
// Resize Args

examples/arm/executor_runner/arm_executor_runner.cpp

Lines changed: 26 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -87,6 +87,7 @@
8787
#include <errno.h>
8888
#include <executorch/extension/data_loader/buffer_data_loader.h>
8989
#include <executorch/extension/runner_util/inputs.h>
90+
#include <executorch/runtime/core/exec_aten/util/scalar_type_util.h>
9091
#include <executorch/runtime/core/memory_allocator.h>
9192
#include <executorch/runtime/executor/program.h>
9293
#include <executorch/runtime/platform/log.h>
@@ -95,6 +96,7 @@
9596
#include <stdio.h>
9697
#include <unistd.h>
9798
#include <memory>
99+
#include <type_traits>
98100
#include <vector>
99101

100102
#include "arm_memory_allocator.h"
@@ -183,6 +185,7 @@ using executorch::runtime::Result;
183185
using executorch::runtime::Span;
184186
using executorch::runtime::Tag;
185187
using executorch::runtime::TensorInfo;
188+
using executorch::runtime::toString;
186189
#if defined(ET_BUNDLE_IO)
187190
using executorch::bundled_program::compute_method_output_error_stats;
188191
using executorch::bundled_program::ErrorStats;
@@ -395,6 +398,19 @@ class Box {
395398
}
396399
};
397400

401+
template <typename ValueType>
402+
void fill_tensor_with_default_value(Tensor& tensor) {
403+
ValueType fill_value{};
404+
if constexpr (std::is_same_v<ValueType, bool>) {
405+
fill_value = true;
406+
} else {
407+
fill_value = ValueType(1);
408+
}
409+
410+
ValueType* data_ptr = tensor.mutable_data_ptr<ValueType>();
411+
std::fill(data_ptr, data_ptr + tensor.numel(), fill_value);
412+
}
413+
398414
Error prepare_input_tensors(
399415
Method& method,
400416
MemoryAllocator& allocator,
@@ -452,32 +468,17 @@ Error prepare_input_tensors(
452468
if (input_evalues[i].isTensor()) {
453469
Tensor& tensor = input_evalues[i].toTensor();
454470
switch (tensor.scalar_type()) {
455-
case ScalarType::Int:
456-
std::fill(
457-
tensor.mutable_data_ptr<int>(),
458-
tensor.mutable_data_ptr<int>() + tensor.numel(),
459-
1);
460-
break;
461-
case ScalarType::Float:
462-
std::fill(
463-
tensor.mutable_data_ptr<float>(),
464-
tensor.mutable_data_ptr<float>() + tensor.numel(),
465-
1.0);
466-
break;
467-
case ScalarType::Char:
468-
std::fill(
469-
tensor.mutable_data_ptr<int8_t>(),
470-
tensor.mutable_data_ptr<int8_t>() + tensor.numel(),
471-
1);
472-
break;
473-
case ScalarType::Bool:
474-
std::fill(
475-
tensor.mutable_data_ptr<int8_t>(),
476-
tensor.mutable_data_ptr<int8_t>() + tensor.numel(),
477-
1);
478-
break;
471+
#define HANDLE_SCALAR_TYPE(cpp_type, scalar_name) \
472+
case ScalarType::scalar_name: \
473+
fill_tensor_with_default_value<cpp_type>(tensor); \
474+
break;
475+
ET_FORALL_SCALAR_TYPES(HANDLE_SCALAR_TYPE)
476+
#undef HANDLE_SCALAR_TYPE
479477
default:
480-
ET_LOG(Error, "Unhandled ScalarType");
478+
ET_LOG(
479+
Error,
480+
"Unhandled ScalarType %s",
481+
toString(tensor.scalar_type()));
481482
err = Error::InvalidArgument;
482483
break;
483484
}

0 commit comments

Comments
 (0)