# QNN Lowering / Export

## Common Setup

```python
from executorch.backends.qualcomm.serialization.qc_schema import QnnExecuTorchBackendType
from executorch.backends.qualcomm.utils.utils import (
    generate_htp_compiler_spec,
    generate_qnn_executorch_compiler_spec,
    get_soc_to_chipset_map,
    to_edge_transform_and_lower_to_qnn,
)

soc_model = get_soc_to_chipset_map()["SM8650"]  # adjust SoC as needed
```
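
The snippets below assume `model` is an `nn.Module` in eval mode and `example_inputs` is a tuple of example tensors; neither is defined by the setup above. A minimal illustrative definition (any module and shapes work):

```python
import torch

class SimpleModel(torch.nn.Module):
    """Placeholder model used only to make the snippets below runnable."""

    def __init__(self):
        super().__init__()
        self.linear = torch.nn.Linear(16, 16)

    def forward(self, x):
        return self.linear(x)

model = SimpleModel().eval()
example_inputs = (torch.randn(1, 16),)
```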

---

## FP16 Export

```python
backend_options = generate_htp_compiler_spec(use_fp16=True)
compiler_specs = generate_qnn_executorch_compiler_spec(
    soc_model=soc_model,
    backend_options=backend_options,
)
edge_prog_mgr = to_edge_transform_and_lower_to_qnn(model, example_inputs, compiler_specs)
et_program = edge_prog_mgr.to_executorch()
```
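
To serialize the result, write the program buffer to a `.pte` file (the filename here is illustrative):

```python
# Write the flatbuffer program to disk for on-device execution.
with open("model_fp16.pte", "wb") as f:
    f.write(et_program.buffer)
```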

---

## Quantized (PTQ) Export

```python
import torch
from torchao.quantization.pt2e.quantize_pt2e import convert_pt2e, prepare_pt2e
from executorch.backends.qualcomm.quantizer.quantizer import QnnQuantizer

# 1. Export to ATen IR
m = torch.export.export(model.eval(), example_inputs, strict=True).module()

# 2. Prepare for quantization
quantizer = QnnQuantizer(
    backend=QnnExecuTorchBackendType.kHtpBackend,
    soc_model=soc_model,
)
m = prepare_pt2e(m, quantizer)

# 3. Calibrate
m(*example_inputs)

# 4. Convert
m = convert_pt2e(m)

# 5. Lower to QNN
backend_options = generate_htp_compiler_spec(use_fp16=False)
compiler_specs = generate_qnn_executorch_compiler_spec(
    soc_model=soc_model,
    backend_options=backend_options,
)
edge_prog_mgr = to_edge_transform_and_lower_to_qnn(m, example_inputs, compiler_specs)
et_program = edge_prog_mgr.to_executorch()
```
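
In practice, step 3 iterates over a representative dataset rather than a single example input. A sketch, where `calibration_data` is a hypothetical iterable of input tuples (not part of the API):

```python
# Run the prepared model over representative samples so the observers
# collect realistic activation ranges before convert_pt2e.
for sample in calibration_data:  # calibration_data is illustrative
    m(*sample)
```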

---

## Quantized (QAT) Export

Same as PTQ but use `prepare_qat_pt2e` and run a training loop instead of calibration:

```python
from torchao.quantization.pt2e.quantize_pt2e import convert_pt2e, prepare_qat_pt2e

m = prepare_qat_pt2e(m, quantizer)
# training loop
m(*example_inputs)
m = convert_pt2e(m)
# ... same lowering steps as PTQ
```
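
A minimal sketch of the "training loop" placeholder above, assuming a classification task; `train_loader`, the optimizer settings, and the loss function are all illustrative:

```python
# Fine-tune the fake-quantized model for some steps before convert_pt2e.
optimizer = torch.optim.SGD(m.parameters(), lr=1e-3)
loss_fn = torch.nn.CrossEntropyLoss()
for inputs, labels in train_loader:  # train_loader is illustrative
    optimizer.zero_grad()
    loss = loss_fn(m(inputs), labels)
    loss.backward()
    optimizer.step()
```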

---

## Quantization Options

| QuantDtype | Activation | Weight |
|---|---|---|
| `use_16a16w` | uint16 | int16 |
| `use_16a8w` | uint16 | int8 |
| `use_16a4w` | uint16 | int4 |
| `use_16a4w_block` | uint16 | int4 (block-wise) |
| `use_8a8w` | uint8 | int8 |
| `use_8a4w` | uint8 | int4 |
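
How the chosen `QuantDtype` is applied depends on the quantizer version. One possibility is the `set_default_quant_config` helper seen in the upstream examples; treat the sketch below as an assumption, not a confirmed API of this doc:

```python
# Sketch only: set_default_quant_config is assumed from upstream examples and may
# differ across versions; constructor arguments match the PTQ snippet above.
from executorch.backends.qualcomm.quantizer.quantizer import QnnQuantizer, QuantDtype

quantizer = QnnQuantizer(
    backend=QnnExecuTorchBackendType.kHtpBackend,
    soc_model=soc_model,
)
quantizer.set_default_quant_config(QuantDtype.use_16a4w, is_qat=False)
```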

**Fine-grained control with QuantRecipe:**

```python
from executorch.backends.qualcomm.quantizer.quant_recipe import QuantRecipe, QuantGranularity
from executorch.backends.qualcomm.quantizer.quantizer import QuantDtype

recipe = QuantRecipe(quant_dtype=QuantDtype.use_8a8w, is_qat=False)
recipe.add_node_target(targets={torch.ops.aten.linear.default}, quant_dtype=QuantDtype.use_16a8w)
recipe.add_regex(regex={"layers.[0-3].attention"}, quant_dtype=QuantDtype.use_16a4w)
```

---

## Pass Pipelines (QnnPassManager)

| Pipeline | When Called | Key Passes |
|---|---|---|
| `transform_for_annotation_pipeline` | Before `prepare_pt2e` (called internally by `QnnQuantizer`) | RemoveRedundancy, Decompose*, Recompose*, ReplaceInfValues |
| `transform_for_export_pipeline` | After `torch.export` | Decompose*, CanonicalizeConv, LiftConstantScalarOperands |
| `get_to_edge_transform_passes` | Before `to_edge` | AnnotateQuantAttrs, FoldQDQ, LayoutTransform, TagQuantIO, **ResolveDebugHandle (must be last)** |
| `transform_for_preprocess_pipeline` | Inside `QnnBackend.preprocess` | FoldQDQ(force_fold=True), InsertRequantize, InsertIOQDQ, LayoutTransform(insert_permute=True), FuseConsecutiveCast |
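
These pipelines are invoked automatically during quantization and lowering, so manual invocation is rarely needed. A rough sketch for reference only; the import path and call signature are assumptions, while the method name comes from the table above:

```python
# Assumed location of QnnPassManager; verify against your executorch checkout.
from executorch.backends.qualcomm._passes.qnn_pass_manager import QnnPassManager

pass_manager = QnnPassManager()
exported = torch.export.export(model.eval(), example_inputs, strict=True)
# Apply the export-time pipeline by hand (signature is an assumption).
exported = pass_manager.transform_for_export_pipeline(exported)
```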

---

## Skipping Ops / Partial Delegation

```python
from executorch.backends.qualcomm.utils.utils import skip_annotation

# Skip specific node targets from being delegated
skip_annotation(model, skipped_ops={torch.ops.aten.add.Tensor})
```

---

## Dumping Context Binary

```python
from executorch.backends.qualcomm.utils.utils import dump_context_from_pte

dump_context_from_pte("model.pte", output_dir="./context_bins/")
```

---

## SoC Reference

See `_soc_info_table` in `backends/qualcomm/serialization/qc_schema.py`.
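
To enumerate the SoCs available at runtime, the keys of `get_soc_to_chipset_map()` (already imported in the Common Setup) can be printed:

```python
# Print every SoC name the installed backend knows about.
for soc_name in get_soc_to_chipset_map():
    print(soc_name)
```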