|
34 | 34 | from executorch.extension.llm.export.config.llm_config import LlmConfig |
35 | 35 | from executorch.extension.llm.export.partitioner_lib import ( |
36 | 36 | get_coreml_partitioner, |
| 37 | + get_ethosu_partitioner, |
37 | 38 | get_mps_partitioner, |
38 | 39 | get_openvino_partitioner, |
39 | 40 | get_qnn_partitioner, |
|
43 | 44 | ) |
44 | 45 | from executorch.extension.llm.export.quantizer_lib import ( |
45 | 46 | get_coreml_quantizer, |
| 47 | + get_ethosu_quantizer, |
46 | 48 | get_ov_quantizer, |
47 | 49 | get_pt2e_quantization_params, |
48 | 50 | get_pt2e_quantizers, |
@@ -218,6 +220,7 @@ def build_args_parser() -> argparse.ArgumentParser: |
218 | 220 | "coreml_baseline_8a_c4w", |
219 | 221 | "vulkan_8w", |
220 | 222 | "tosa_8a8w", |
| 223 | + "ethosu_8a8w", |
221 | 224 | ], |
222 | 225 | help="Use PT2E quantization. Comma separated options. e.g. xnnpack_dynamic (for per channel 8 bit weight), xnnpack_dynamic_qc4 (for per channel 4 bit weight), embedding.", |
223 | 226 | ) |
@@ -813,6 +816,14 @@ def get_quantizer_and_quant_params(llm_config): |
813 | 816 | llm_config.backend.tosa.version, llm_config.quantization.pt2e_quantize.value |
814 | 817 | ) |
815 | 818 | quantizers.append(tosa_quantizer) |
| 819 | + if llm_config.backend.ethosu.enabled and llm_config.quantization.pt2e_quantize: |
| 820 | + ethosu_quantizer = get_ethosu_quantizer( |
| 821 | + llm_config.backend.ethosu.target, |
| 822 | + llm_config.backend.ethosu.system_config, |
| 823 | + llm_config.backend.ethosu.memory_mode, |
| 824 | + llm_config.quantization.pt2e_quantize.value, |
| 825 | + ) |
| 826 | + quantizers.append(ethosu_quantizer) |
816 | 827 | if llm_config.backend.vulkan.enabled and llm_config.quantization.pt2e_quantize: |
817 | 828 | assert ( |
818 | 829 | len(quantizers) == 0 |
@@ -984,20 +995,27 @@ def _to_edge_and_lower_llama_openvino( |
984 | 995 | return builder.to_executorch(passes=additional_passes) |
985 | 996 |
|
986 | 997 |
|
987 | | -def _to_edge_and_lower_llama_tosa( |
| 998 | +def _to_edge_and_lower_llama_arm( |
988 | 999 | builder_exported, |
989 | 1000 | modelname, |
990 | 1001 | quantizers, |
991 | 1002 | additional_passes, |
992 | | - tosa_spec, |
| 1003 | + llm_config: LlmConfig, |
993 | 1004 | verbose: bool = False, |
994 | 1005 | ) -> LLMEdgeManager: |
995 | 1006 | logging.info("Lowering model using TOSA partitioner") |
996 | 1007 |
|
997 | 1008 | partitioners = [] |
998 | | - partitioners.append(get_tosa_partitioner(tosa_spec)) |
999 | | - |
1000 | | - modelname = f"tosa_{modelname}" |
| 1009 | + if llm_config.backend.ethosu.enabled: |
| 1010 | + partitioners.append( |
| 1011 | + get_ethosu_partitioner( |
| 1012 | + llm_config.backend.ethosu.target, |
| 1013 | + ) |
| 1014 | + ) |
| 1015 | + modelname = f"ethosu_{modelname}" |
| 1016 | + elif llm_config.backend.tosa.enabled: |
| 1017 | + partitioners.append(get_tosa_partitioner(llm_config.backend.tosa.version)) |
| 1018 | + modelname = f"tosa_{modelname}" |
1001 | 1019 |
|
1002 | 1020 | builder = builder_exported.pt2e_quantize(quantizers).to_edge_transform_and_lower( |
1003 | 1021 | partitioners |
@@ -1365,13 +1383,13 @@ def _export_llama(llm_config: LlmConfig) -> LLMEdgeManager: # noqa: C901 |
1365 | 1383 | openvino_device=llm_config.backend.openvino.device, |
1366 | 1384 | verbose=llm_config.debug.verbose, |
1367 | 1385 | ) |
1368 | | - elif llm_config.backend.tosa.enabled: |
1369 | | - builder = _to_edge_and_lower_llama_tosa( |
| 1386 | + elif llm_config.backend.tosa.enabled or llm_config.backend.ethosu.enabled: |
| 1387 | + builder = _to_edge_and_lower_llama_arm( |
1370 | 1388 | builder_exported, |
1371 | 1389 | modelname, |
1372 | 1390 | quantizers, |
1373 | 1391 | additional_passes, |
1374 | | - llm_config.backend.tosa.version, |
| 1392 | + llm_config, |
1375 | 1393 | verbose=llm_config.debug.verbose, |
1376 | 1394 | ) |
1377 | 1395 | else: |
|
0 commit comments