Commit b63392e
feat: Add SME GEMM and GEMV kernels.
Add SME versions of most SME2 kernels.
As a side effect, synchronize with the latest upstream arm_gemm repo.
This includes deleting a substantial amount of dead files that weren't
being used for anything. arm_gemm include files are now stored in a
slightly different place, so various other places that refer to these
are updated to match.
Partially Resolves: COMPMID-8788, ARMCL-1239
Signed-off-by: David Mansell <David.Mansell@arm.com>
Change-Id: Iaa6f5a8caa4109098b8b5f9173df0909a4e940d6
Signed-off-by: Gunes Bayir <gunes.bayir@arm.com>
1 parent 902f9a7 commit b63392e
File tree
663 files changed
+125147
-100273
lines changed
- scripts
- src
- core
- CPP
- NEON/kernels
- arm_conv
- depthwise
- interleaves
- kernels
- a64_s8q_nhwc_3x3_s1_output2x2_dot_depthfirst
- a64_s8q_nhwc_3x3_s1_output2x2_mla_depthfirst
- a64_s8q_nhwc_3x3_s2_output2x2_mla_depthfirst
- a64_s8q_nhwc_5x5_s1_output2x2_mla_depthfirst
- a64_s8q_nhwc_generic_output9_mla_depthfirst
- a64_s8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst
- a64_s8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst
- a64_s8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst
- a64_s8qs_nhwc_3x3_s1_output2x2_dot_depthfirst
- a64_u8q_nhwc_3x3_s1_output2x2_dot_depthfirst
- a64_u8q_nhwc_3x3_s1_output2x2_mla_depthfirst
- a64_u8q_nhwc_3x3_s2_output2x2_mla_depthfirst
- a64_u8q_nhwc_5x5_s1_output2x2_mla_depthfirst
- a64_u8q_nhwc_generic_output9_mla_depthfirst
- a64_u8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst
- a64_u8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst
- a64_u8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst
- a64_u8qa_nhwc_3x3_s1_output2x2_mla_depthfirst
- a64_u8qa_nhwc_3x3_s2_output2x2_mla_depthfirst
- a64_u8qa_nhwc_5x5_s1_output2x2_mla_depthfirst
- a64_u8s8u8q_nhwc_3x3_s1_output2x2_mla_depthfirst
- a64_u8s8u8q_nhwc_3x3_s2_output2x2_mla_depthfirst
- a64_u8s8u8q_nhwc_5x5_s1_output2x2_mla_depthfirst
- a64_u8s8u8q_nhwc_generic_output9_mla_depthfirst
- a64_u8s8u8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst
- sme2_s8q_planar_3x3_s1_4rows_dot_za
- sme2_s8q_planar_3x3_s2_4rows_dot_za
- sme2_s8q_planar_5x5_s1_4rows_dot_za
- sme2_s8q_planar_5x5_s2_4rows_dot_za
- sme2_u8q_planar_3x3_s1_4rows_dot_za
- sme2_u8q_planar_3x3_s2_4rows_dot_za
- sme2_u8q_planar_5x5_s1_4rows_dot_za
- sme2_u8q_planar_5x5_s2_4rows_dot_za
- sme2_u8s8u8q_planar_3x3_s1_4rows_dot_za
- sme2_u8s8u8q_planar_3x3_s2_4rows_dot_za
- sme2_u8s8u8q_planar_5x5_s1_4rows_dot_za
- sme2_u8s8u8q_planar_5x5_s2_4rows_dot_za
- sve_s8q_nhwc_3x3_s1_output2x2_dot_depthfirst
- sve_s8q_nhwc_3x3_s1_output2x2_mla_depthfirst
- sve_s8q_nhwc_3x3_s2_output2x2_mla_depthfirst
- sve_s8q_nhwc_5x5_s1_output2x2_mla_depthfirst
- sve_s8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst
- sve_s8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst
- sve_s8qs_nhwc_3x3_s1_output2x2_dot_depthfirst
- sve_u8q_nhwc_3x3_s1_output2x2_dot_depthfirst
- sve_u8q_nhwc_3x3_s1_output2x2_mla_depthfirst
- sve_u8q_nhwc_3x3_s2_output2x2_mla_depthfirst
- sve_u8q_nhwc_5x5_s1_output2x2_mla_depthfirst
- sve_u8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst
- sve_u8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst
- sve_u8s8u8q_nhwc_3x3_s1_output2x2_mla_depthfirst
- sve_u8s8u8q_nhwc_3x3_s2_output2x2_mla_depthfirst
- sve_u8s8u8q_nhwc_5x5_s1_output2x2_mla_depthfirst
- pooling
- kernels/cpp_nhwc_1x1_stride_any_depthfirst
- arm_gemm
- indirect-interleaves
- kernels
- a32_sgemm_8x6
- a64_ffhybrid_bf16fp32_mmla_6x16
- a64_ffhybrid_fp16_mla_6x32
- a64_ffhybrid_fp16fp32_mla_6x16
- a64_ffhybrid_fp16fp32fp16_mla_6x16
- a64_ffhybrid_fp32_mla_6x16
- a64_ffhybrid_fp32bf16fp32_mmla_4x24
- a64_ffhybrid_fp32bf16fp32_mmla_6x16
- a64_ffinterleaved_bf16fp32_dot_8x12
- a64_ffinterleaved_bf16fp32_mmla_8x12
- a64_ffinterleaved_fp16_mla_8x24
- a64_ffinterleaved_fp32_mla_8x12
- a64_gemm_s16_8x12
- a64_gemm_s8_4x4
- a64_gemm_s8_8x12
- a64_gemm_u16_8x12
- a64_gemm_u8_4x4
- a64_gemm_u8_8x12
- a64_hgemm_8x24
- a64_hybrid_bf16fp32_dot_6x16
- a64_hybrid_bf16fp32_mmla_6x16
- a64_hybrid_fp16_mla_6x32
- a64_hybrid_fp16fp32_mla_6x16
- a64_hybrid_fp16fp32fp16_mla_6x16
- a64_hybrid_fp32_mla_4x24
- a64_hybrid_fp32_mla_6x16
- a64_hybrid_fp32_mla_8x4
- a64_hybrid_fp32bf16fp32_mmla_4x24
- a64_hybrid_fp32bf16fp32_mmla_6x16
- a64_hybrid_s8qa_dot_4x16
- a64_hybrid_s8qa_mmla_4x16
- a64_hybrid_s8qs_dot_6x16
- a64_hybrid_s8qs_mmla_6x16
- a64_hybrid_s8s32_dot_6x16
- a64_hybrid_s8s32_mmla_6x16
- a64_hybrid_u8qa_dot_4x16
- a64_hybrid_u8qa_mmla_4x16
- a64_hybrid_u8s8qa_dot_4x16
- a64_hybrid_u8s8qa_mmla_4x16
- a64_hybrid_u8s8s32_dot_6x16
- a64_hybrid_u8s8s32_mmla_6x16
- a64_hybrid_u8u32_dot_6x16
- a64_hybrid_u8u32_mmla_6x16
- a64_interleaved_bf16fp32_dot_8x12
- a64_interleaved_bf16fp32_mmla_8x12
- a64_interleaved_s8s32_mmla_8x12
- a64_interleaved_u8s8s32_mmla_8x12
- a64_interleaved_u8u32_mmla_8x12
- a64_sgemm_8x12
- a64_sgemm_8x6
- a64_sgemv_pretransposed
- a64_smallK_hybrid_fp32_mla_6x4
- a64_smallK_hybrid_fp32_mla_8x4
- a64_smallK_hybrid_s8s32_dot_6x4
- a64_smallK_hybrid_s8s32_dot_8x4
- a64_smallK_hybrid_u8u32_dot_6x4
- a64_smallK_hybrid_u8u32_dot_8x4
- sme2_gemv_bf16fp32_dot_16VL
- sme2_gemv_fp16_mla_16VL
- sme2_gemv_fp16fp32fp16_dot_16VL
- sme2_gemv_fp32_mla_16VL
- sme2_gemv_fp32bf16fp32_dot_16VL
- sme2_gemv_s8qa_dot_16VL
- sme2_gemv_u8qa_dot_16VL
- sme2_interleaved_nomerge_bf16fp32_mopa_1VLx4VL
- sme2_interleaved_nomerge_bf16fp32_mopa_2VLx2VL
- sme2_interleaved_nomerge_bf16fp32_mopa_4VLx1VL
- sme2_interleaved_nomerge_fp16fp32_mopa_1VLx4VL
- sme2_interleaved_nomerge_fp16fp32_mopa_2VLx2VL
- sme2_interleaved_nomerge_fp16fp32_mopa_4VLx1VL
- sme2_interleaved_nomerge_fp16fp32fp16_mopa_1VLx4VL
- sme2_interleaved_nomerge_fp16fp32fp16_mopa_2VLx2VL
- sme2_interleaved_nomerge_fp16fp32fp16_mopa_4VLx1VL
- sme2_interleaved_nomerge_fp32_mopa_1VLx4VL
- sme2_interleaved_nomerge_fp32_mopa_2VLx2VL
- sme2_interleaved_nomerge_fp32_mopa_4VLx1VL
- sme2_interleaved_nomerge_s8q_mopa_1VLx4VL
- sme2_interleaved_nomerge_s8q_mopa_2VLx2VL
- sme2_interleaved_nomerge_s8q_mopa_4VLx1VL
- sme2_interleaved_nomerge_s8qfp32_mopa_1VLx4VL
- sme2_interleaved_nomerge_s8qfp32_mopa_2VLx2VL
- sme2_interleaved_nomerge_s8qfp32_mopa_4VLx1VL
- sme2_interleaved_nomerge_s8s32_mopa_1VLx4VL
- sme2_interleaved_nomerge_s8s32_mopa_2VLx2VL
- sme2_interleaved_nomerge_s8s32_mopa_4VLx1VL
- sme2_interleaved_nomerge_u8q_mopa_1VLx4VL
- sme2_interleaved_nomerge_u8q_mopa_2VLx2VL
- sme2_interleaved_nomerge_u8q_mopa_4VLx1VL
- sme_gemv_bf16fp32_dot_8VL
- sme_gemv_fp16_mla_8VL
- sme_gemv_fp16fp32fp16_mla_8VL_rhs2VL
- sme_gemv_fp32_mla_8VL
- sme_gemv_fp32bf16fp32_dot_8VL
- sme_gemv_s8qa_dot_8VL
- sme_gemv_u8qa_dot_8VL
- sme_interleaved_nomerge_bf16fp32_mopa_1VLx4VL
- sme_interleaved_nomerge_bf16fp32_mopa_2VLx2VL
- sme_interleaved_nomerge_bf16fp32_mopa_4VLx1VL
- sme_interleaved_nomerge_fp16fp32_mopa_1VLx4VL
- sme_interleaved_nomerge_fp16fp32_mopa_2VLx2VL
- sme_interleaved_nomerge_fp16fp32_mopa_4VLx1VL
- sme_interleaved_nomerge_fp16fp32fp16_mopa_1VLx4VL
- sme_interleaved_nomerge_fp16fp32fp16_mopa_2VLx2VL
- sme_interleaved_nomerge_fp16fp32fp16_mopa_4VLx1VL
- sme_interleaved_nomerge_fp32_mopa_1VLx4VL
- sme_interleaved_nomerge_fp32_mopa_2VLx2VL
- sme_interleaved_nomerge_fp32_mopa_4VLx1VL
- sme_interleaved_nomerge_s8q_mopa_1VLx4VL
- sme_interleaved_nomerge_s8q_mopa_2VLx2VL
- sme_interleaved_nomerge_s8q_mopa_4VLx1VL
- sme_interleaved_nomerge_s8qfp32_mopa_1VLx4VL
- sme_interleaved_nomerge_s8qfp32_mopa_2VLx2VL
- sme_interleaved_nomerge_s8qfp32_mopa_4VLx1VL
- sme_interleaved_nomerge_s8s32_mopa_1VLx4VL
- sme_interleaved_nomerge_s8s32_mopa_2VLx2VL
- sme_interleaved_nomerge_s8s32_mopa_4VLx1VL
- sme_interleaved_nomerge_u8q_mopa_1VLx4VL
- sme_interleaved_nomerge_u8q_mopa_2VLx2VL
- sme_interleaved_nomerge_u8q_mopa_4VLx1VL
- sve_ffhybrid_bf16fp32_mmla_6x4VL
- sve_ffhybrid_fp16_mla_6x4VL
- sve_ffhybrid_fp16fp32_mla_6x4VL
- sve_ffhybrid_fp16fp32fp16_mla_6x4VL
- sve_ffhybrid_fp32_mla_6x4VL
- sve_ffhybrid_fp32bf16fp32_mmla_4x6VL
- sve_ffinterleaved_bf16fp32_dot_8x3VL
- sve_ffinterleaved_bf16fp32_mmla_8x3VL
- sve_ffinterleaved_fp16_mla_8x3VL
- sve_ffinterleaved_fp32_mla_8x3VL
- sve_hybrid_bf16fp32_dot_6x4VL
- sve_hybrid_bf16fp32_mmla_6x4VL
- sve_hybrid_fp16_mla_6x4VL
- sve_hybrid_fp16fp32_mla_6x4VL
- sve_hybrid_fp16fp32fp16_mla_6x4VL
- sve_hybrid_fp32_mla_6x4VL
- sve_hybrid_fp32_mla_8x1VL
- sve_hybrid_fp32bf16fp32_mmla_4x6VL
- sve_hybrid_fp32bf16fp32_mmla_6x4VL
- sve_hybrid_s8qa_dot_4x4VL
- sve_hybrid_s8qa_mmla_4x4VL
- sve_hybrid_s8qs_dot_6x4VL
- sve_hybrid_s8qs_mmla_6x4VL
- sve_hybrid_s8s32_dot_6x4VL
- sve_hybrid_s8s32_mmla_6x4VL
- sve_hybrid_u8qa_dot_4x4VL
- sve_hybrid_u8qa_mmla_4x4VL
- sve_hybrid_u8s8qa_dot_4x4VL
- sve_hybrid_u8s8qa_mmla_4x4VL
- sve_hybrid_u8s8s32_mmla_6x4VL
- sve_hybrid_u8u32_dot_6x4VL
- sve_hybrid_u8u32_mmla_6x4VL
- sve_interleaved_bf16fp32_dot_8x3VL
- sve_interleaved_bf16fp32_mmla_8x3VL
- sve_interleaved_fp16_mla_8x3VL
- sve_interleaved_fp32_mla_8x3VL
- sve_interleaved_fp32_mmla_8x3VL
- sve_interleaved_s8s32_dot_8x3VL
- sve_interleaved_s8s32_mmla_8x3VL
- sve_interleaved_u8s8s32_mmla_8x3VL
- sve_interleaved_u8u32_dot_8x3VL
- sve_interleaved_u8u32_mmla_8x3VL
- merges
- transforms
- assembly
- utils
- cpu
- kernels/assembly
- arm_common
- internal
- arm_gemm
- operators
- internal
- tests
- validation
- NEON
- fixtures
- reference
Some content is hidden
Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.
663 files changed
+125147
-100273
lines changed

| Original file line number | Diff line number | Diff line change | |
|---|---|---|---|
| |||
1 | | - | |
| 1 | + | |
2 | 2 | | |
3 | 3 | | |
4 | 4 | | |
| |||
31 | 31 | | |
32 | 32 | | |
33 | 33 | | |
| 34 | + | |
| 35 | + | |
34 | 36 | | |
35 | 37 | | |
36 | 38 | | |
| |||
88 | 90 | | |
89 | 91 | | |
90 | 92 | | |
91 | | - | |
| 93 | + | |
92 | 94 | | |
93 | 95 | | |
94 | 96 | | |
| |||
| Original file line number | Diff line number | Diff line change | |
|---|---|---|---|
| |||
1 | 1 | | |
2 | | - | |
| 2 | + | |
3 | 3 | | |
4 | 4 | | |
5 | 5 | | |
| |||
345 | 345 | | |
346 | 346 | | |
347 | 347 | | |
348 | | - | |
349 | 348 | | |
350 | 349 | | |
351 | 350 | | |
| |||
1286 | 1285 | | |
1287 | 1286 | | |
1288 | 1287 | | |
1289 | | - | |
1290 | 1288 | | |
1291 | 1289 | | |
1292 | 1290 | | |
| |||
1300 | 1298 | | |
1301 | 1299 | | |
1302 | 1300 | | |
1303 | | - | |
1304 | 1301 | | |
1305 | 1302 | | |
1306 | 1303 | | |
| |||
1312 | 1309 | | |
1313 | 1310 | | |
1314 | 1311 | | |
1315 | | - | |
1316 | 1312 | | |
1317 | 1313 | | |
1318 | 1314 | | |
| |||
1342 | 1338 | | |
1343 | 1339 | | |
1344 | 1340 | | |
| 1341 | + | |
| 1342 | + | |
| 1343 | + | |
| 1344 | + | |
| 1345 | + | |
| 1346 | + | |
| 1347 | + | |
| 1348 | + | |
| 1349 | + | |
| 1350 | + | |
| 1351 | + | |
| 1352 | + | |
| 1353 | + | |
| 1354 | + | |
| 1355 | + | |
| 1356 | + | |
1345 | 1357 | | |
1346 | 1358 | | |
1347 | 1359 | | |
| 1360 | + | |
| 1361 | + | |
| 1362 | + | |
| 1363 | + | |
| 1364 | + | |
| 1365 | + | |
| 1366 | + | |
| 1367 | + | |
| 1368 | + | |
| 1369 | + | |
| 1370 | + | |
| 1371 | + | |
1348 | 1372 | | |
1349 | 1373 | | |
1350 | 1374 | | |
| |||
| Original file line number | Diff line number | Diff line change | |
|---|---|---|---|
| |||
1 | | - | |
| 1 | + | |
2 | 2 | | |
3 | 3 | | |
4 | 4 | | |
| |||
29 | 29 | | |
30 | 30 | | |
31 | 31 | | |
32 | | - | |
| 32 | + | |
33 | 33 | | |
34 | 34 | | |
35 | 35 | | |
| |||
| Original file line number | Diff line number | Diff line change | |
|---|---|---|---|
| |||
1 | 1 | | |
2 | 2 | | |
3 | 3 | | |
4 | | - | |
| 4 | + | |
5 | 5 | | |
6 | 6 | | |
7 | 7 | | |
| |||
650 | 650 | | |
651 | 651 | | |
652 | 652 | | |
| 653 | + | |
| 654 | + | |
653 | 655 | | |
654 | 656 | | |
655 | 657 | | |
| |||
| Original file line number | Diff line number | Diff line change | |
|---|---|---|---|
| |||
1702 | 1702 | | |
1703 | 1703 | | |
1704 | 1704 | | |
1705 | | - | |
1706 | 1705 | | |
1707 | 1706 | | |
1708 | 1707 | | |
| |||
1760 | 1759 | | |
1761 | 1760 | | |
1762 | 1761 | | |
1763 | | - | |
1764 | | - | |
1765 | | - | |
1766 | | - | |
1767 | 1762 | | |
1768 | 1763 | | |
1769 | 1764 | | |
| |||
1778 | 1773 | | |
1779 | 1774 | | |
1780 | 1775 | | |
1781 | | - | |
1782 | 1776 | | |
1783 | 1777 | | |
1784 | 1778 | | |
| |||
1808 | 1802 | | |
1809 | 1803 | | |
1810 | 1804 | | |
| 1805 | + | |
| 1806 | + | |
| 1807 | + | |
| 1808 | + | |
| 1809 | + | |
| 1810 | + | |
| 1811 | + | |
| 1812 | + | |
| 1813 | + | |
| 1814 | + | |
| 1815 | + | |
| 1816 | + | |
| 1817 | + | |
| 1818 | + | |
| 1819 | + | |
| 1820 | + | |
1811 | 1821 | | |
1812 | 1822 | | |
1813 | 1823 | | |
| 1824 | + | |
| 1825 | + | |
| 1826 | + | |
| 1827 | + | |
| 1828 | + | |
| 1829 | + | |
| 1830 | + | |
| 1831 | + | |
| 1832 | + | |
| 1833 | + | |
| 1834 | + | |
| 1835 | + | |
1814 | 1836 | | |
1815 | 1837 | | |
1816 | 1838 | | |
| |||
1857 | 1879 | | |
1858 | 1880 | | |
1859 | 1881 | | |
1860 | | - | |
1861 | | - | |
| 1882 | + | |
1862 | 1883 | | |
1863 | 1884 | | |
1864 | 1885 | | |
| |||
| Original file line number | Diff line number | Diff line change | |
|---|---|---|---|
| |||
1 | 1 | | |
2 | 2 | | |
3 | 3 | | |
4 | | - | |
| 4 | + | |
5 | 5 | | |
6 | 6 | | |
7 | 7 | | |
| |||
95 | 95 | | |
96 | 96 | | |
97 | 97 | | |
98 | | - | |
| 98 | + | |
99 | 99 | | |
100 | 100 | | |
101 | 101 | | |
| |||
0 commit comments