forked from pytorch/executorch
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathvulkan_preprocess.py
More file actions
252 lines (218 loc) · 10.2 KB
/
vulkan_preprocess.py
File metadata and controls
252 lines (218 loc) · 10.2 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.
# pyre-strict
import copy
from functools import partial
from typing import Any, Callable, Dict, final, List
import executorch.backends.vulkan.utils as utils
from executorch.backends.transforms.addmm_mm_to_linear import AddmmToLinearTransform
from executorch.backends.transforms.fuse_conv_with_clamp import FuseClampPass
from executorch.backends.transforms.fuse_view_copy import FuseViewCopyTransform
from executorch.backends.transforms.view_copy_to_squeeze_unsqueeze import (
ViewCopyToSqueezeUnsqueezePass,
)
from executorch.backends.vulkan._passes import (
FoldQDQPass,
FuseQuantizedOpsTransform,
insert_prepack_nodes,
InsertDtypePromotionPass,
RemoveRedundantOpsTransform,
SqueezeUnsqueezeInputs,
TagMemoryMetaPass,
)
from executorch.backends.vulkan._passes.fuse_patterns import FusePatternsPass
from executorch.backends.vulkan._passes.remove_asserts import RemoveAssertsTransform
from executorch.backends.vulkan.serialization.vulkan_graph_builder import VkGraphBuilder
from executorch.backends.vulkan.serialization.vulkan_graph_schema import (
VkMemoryLayout,
VkStorageType,
)
from executorch.backends.vulkan.serialization.vulkan_graph_serialize import (
serialize_vulkan_graph,
)
from executorch.backends.xnnpack._passes import FuseBatchNormPass
from executorch.exir.backend.backend_details import (
BackendDetails,
CompileSpec,
ExportedProgram,
PreprocessResult,
)
from executorch.exir.backend.utils import DelegateMappingBuilder
from executorch.exir.memory_planning import greedy, MemoryPlanningAlgorithmSuite
from executorch.exir.pass_base import ExportPass, PassBase
from executorch.exir.passes import MemoryPlanningPass, SpecPropPass
from executorch.exir.passes.sym_shape_eval_pass import ConstraintBasedSymShapeEvalPass
from executorch.exir.program._program import _transform
from torch._export.verifier import Verifier
from torch.export._remove_auto_functionalized_pass import (
unsafe_remove_auto_functionalized_pass,
)
# Sentinel debug-handle value (max uint16). Not referenced anywhere in this
# file — presumably consumed by callers of this module; TODO confirm usage.
DEFAULT_DEBUG_HANDLE = 65535
class _any_op(Verifier):
    """Permissive verifier passed to ``_transform`` (see ``apply_passes``).

    It accepts any callable as an operator so that graphs containing
    non-ATen (e.g. Vulkan custom) ops are not rejected during
    re-verification after a pass runs.
    """

    # Set training dialect to skip functional check in base verifier
    dialect = "TRAINING"

    def allowed_op_types(self):
        # Allow every callable op type rather than the verifier's default
        # restricted set.
        return (Callable,)
# pyre-ignore
def apply_passes(program: ExportedProgram, passes) -> ExportedProgram:
    """Apply a sequence of graph passes to ``program`` and return the result.

    Three kinds of entries are supported in ``passes``:
    * ``MemoryPlanningPass`` instances (with a ``run`` method), which mutate
      the graph module in place;
    * ``ExportPass``/``PassBase`` subclasses, which are applied through
      ``_transform`` with a permissive verifier;
    * plain callables (e.g. ``insert_prepack_nodes``), which take and return
      the exported program directly.
    """
    for pass_obj in passes:
        runs_in_place = isinstance(pass_obj, MemoryPlanningPass) and hasattr(
            pass_obj, "run"
        )
        if runs_in_place:
            pass_obj.run(program.graph_module)
        elif isinstance(pass_obj, (ExportPass, PassBase)):
            # Some passes require the ep to be provided. However, since the ep
            # may be updated with each pass applied, the ep must be set right
            # before calling the pass. _exported_program is the attribute used
            # by XNNPACK and Vulkan passes to store the exported program.
            if hasattr(pass_obj, "_exported_program"):
                pass_obj._exported_program = program
            program = _transform(program, pass_obj, override_verifiers=[_any_op])
            # See the application of this function in exir/program/_program.py
            # for more details on why this step is necessary.
            if isinstance(pass_obj, SpecPropPass):
                pass_obj.update_placeholder_tensor_specs(
                    program, program.graph_module
                )
        else:
            program = pass_obj(program)
    return program
def parse_compile_spec(compile_specs: List[CompileSpec]) -> Dict[str, Any]:
    """Decode a list of ``CompileSpec``s into an options dict.

    Each recognized key's byte value is decoded (little-endian) into the
    appropriate Python type; unrecognized keys are silently ignored so that
    unknown specs do not break the delegate.

    Args:
        compile_specs: raw key/value compile specs supplied to the backend.

    Returns:
        Mapping from option name to its decoded value.
    """
    int_keys = {"texture_limits_x", "texture_limits_y", "texture_limits_z"}
    # Boolean options read by VulkanBackend.preprocess(). "small_texture_limits"
    # and "skip_memory_planning" are included because preprocess() reads them
    # from the parsed options; previously they were never decoded here, so
    # specifying them had no effect (they always fell back to False).
    bool_keys = {
        "skip_tag_memory_metadata",
        "downcast_64_bit",
        "force_fp16",
        "small_texture_limits",
        "skip_memory_planning",
    }

    options: Dict[str, Any] = {}
    for spec in compile_specs:
        if spec.key == "storage_type_override":
            options[spec.key] = VkStorageType(
                int.from_bytes(spec.value, byteorder="little")
            )
        elif spec.key == "memory_layout_override":
            options[spec.key] = VkMemoryLayout(
                int.from_bytes(spec.value, byteorder="little")
            )
        elif spec.key in int_keys:
            options[spec.key] = int.from_bytes(spec.value, byteorder="little")
        elif spec.key in bool_keys:
            options[spec.key] = bool.from_bytes(spec.value, byteorder="little")
        # Unhandled options are ignored
    return options
@final
class VulkanBackend(BackendDetails):
    """ExecuTorch backend that lowers an exported program to a serialized
    Vulkan compute graph (the delegate payload executed by the Vulkan
    runtime)."""

    @classmethod
    # pyre-ignore
    def preprocess(  # noqa: C901
        cls,
        program: ExportedProgram,
        module_compile_spec: List[CompileSpec],
    ) -> PreprocessResult:
        """Lower ``program`` into Vulkan delegate bytes.

        Applies a fixed pipeline of graph transforms (operator fusion,
        spec propagation, prepack-node insertion, memory-metadata tagging,
        symbolic-shape evaluation, memory planning), then serializes the
        resulting graph via ``VkGraphBuilder``.

        Args:
            program: the exported program to lower.
            module_compile_spec: raw compile specs; see
                ``parse_compile_spec`` for the recognized keys.

        Returns:
            A ``PreprocessResult`` holding the serialized Vulkan graph,
            the delegate debug-handle mapping, and the named data store
            output.
        """
        compile_options = parse_compile_spec(module_compile_spec)

        default_texture_limits = copy.deepcopy(utils.DEFAULT_TEXTURE_LIMITS)
        # 2048 is the typical limit value for 3D textures, but mobile GPUs often
        # support 16384. Since the Vulkan delegate primarily targets mobile GPUs
        # at the moment, 16384 is the default texture limit used. This option is
        # provided as a convenient way to switch to using a limit of 2048 for
        # image textures which will be compatible with most GPUs.
        if compile_options.get("small_texture_limits", False):
            default_texture_limits[0] = 2048
            default_texture_limits[1] = 2048
            default_texture_limits[2] = 2048

        # Per-axis texture-extent limits; each axis may be overridden
        # independently via compile specs.
        limits_x = compile_options.get("texture_limits_x", default_texture_limits[0])
        limits_y = compile_options.get("texture_limits_y", default_texture_limits[1])
        limits_z = compile_options.get("texture_limits_z", default_texture_limits[2])
        texture_limits = (limits_x, limits_y, limits_z)

        default_storage_type = compile_options.get(
            "storage_type_override", VkStorageType.TEXTURE_3D
        )
        default_memory_layout = compile_options.get(
            "memory_layout_override", VkMemoryLayout.TENSOR_WIDTH_PACKED
        )
        # Whether 64-bit tensors should be downcast during serialization
        # (defaults to on).
        downcast_64_bit = compile_options.get("downcast_64_bit", True)
        force_fp16 = compile_options.get("force_fp16", False)

        program = unsafe_remove_auto_functionalized_pass(program)

        # First, apply passes that fuse/remove operators to consolidate the graph
        # structure but still preserve an "ATen-compliant" graph structure (i.e. all
        # arguments to ATen operators must match the ATen function schema).
        program = apply_passes(
            program,
            [
                AddmmToLinearTransform(),
                FuseBatchNormPass(program),
                # Applied again because batch-norm fusion can introduce new
                # addmm/mm nodes — presumably; TODO confirm why the second
                # AddmmToLinearTransform is needed.
                AddmmToLinearTransform(),
                InsertDtypePromotionPass(),
                FusePatternsPass(),
                FuseClampPass(),
                RemoveRedundantOpsTransform(),
                FuseQuantizedOpsTransform(),
                FoldQDQPass(),
                SqueezeUnsqueezeInputs(),
                FuseViewCopyTransform(),
                ViewCopyToSqueezeUnsqueezePass(),
            ],
        )

        # Next annotate tensor nodes with TensorSpec structs which is needed for dynamic
        # shapes and memory planning. Until this point, the graph must be ATen compliant
        # because SpecPropPass will be calling the underlying ATen operators during its
        # execution.
        program = apply_passes(program, [SpecPropPass()])

        # Apply graph transforms which either require `TensorSpec`s to have been created
        # or would create a non ATen compliant graph structure.
        program = apply_passes(
            program,
            [
                RemoveAssertsTransform(),
                insert_prepack_nodes,
            ],
        )

        # Optionally apply the memory metadata tagging pass, which will insert storage
        # type and memory layout transition nodes to ensure that all tensor arguments
        # to an operator are in a supported or optimal configuration. If this pass is
        # not applied, there will be a risk that some operators receive arguments with
        # memory settings that are not supported by the implementation.
        if not compile_options.get("skip_tag_memory_metadata", False):
            program = apply_passes(
                program,
                [
                    TagMemoryMetaPass(
                        texture_limits,
                        default_storage_type=default_storage_type,
                        default_memory_layout=default_memory_layout,
                        force_fp16=force_fp16,
                    ),
                ],
            )

        # Finally, apply dynamic shape passes and memory planning pass. These passes
        # must be applied only when the graph structure is finalized.
        final_passes = [
            ConstraintBasedSymShapeEvalPass(),
        ]
        if not compile_options.get("skip_memory_planning", False):
            greedy_memory_planning = partial(
                greedy, allow_overlapping_allocations=False
            )
            mem_planning_suite = MemoryPlanningAlgorithmSuite(
                algo_list=[greedy_memory_planning]
            )
            # This is a workaround to allow the memory planning pass to work without
            # having to first apply ToOutVarPass(). See the `greedy()` function in
            # `exir.memory_planning`; if this attribute isn't set, assertions in
            # `collect_spec_from_nodes()` will fail.
            program.graph_module.encounter_to_out_var_failure = True
            final_passes.append(
                MemoryPlanningPass(memory_planning_algo=mem_planning_suite)
            )
        program = apply_passes(program, final_passes)

        # Build and serialize the Vulkan graph representation of the final,
        # fully-transformed program.
        graph_builder = VkGraphBuilder(
            program,
            DelegateMappingBuilder(generated_identifiers=True),
            downcast_64_bit=downcast_64_bit,
            force_fp16=force_fp16,
        )
        vk_graph = graph_builder.build_graph()

        return PreprocessResult(
            processed_bytes=serialize_vulkan_graph(
                vk_graph, graph_builder.const_tensors, []
            ),
            debug_handle_map=graph_builder.delegate_mapping_builder.get_delegate_mapping(),
            data_store_output=graph_builder.named_data_store.get_named_data_store_output(),
        )