Skip to content

Commit 5d14e5d

Browse files
njsyw1997Kim-Chyan Ganmax-krasnyansky
authored
hexagon: optimization for HMX mat_mul (ggml-org#21554)
* hexagon: add async HMX worker Introduce hmx-worker (dedicated thread for HMX compute) to overlap HMX matmul with HVX dequant/DMA stages in the pipeline path, replacing the previous synchronous HMX calls that blocked the main thread. * hexagon: cost-based VTCM chunk search for out-stationary matmul * hexagon: fix futex race in hmx_worker_drain Store the boolean in a local variable to avoid loading the atomic twice * hex-mm: hmx optimize scatter/transpose and use HMX intrinsics * hex-vmem: drop vmem limit to a touch under 3GB on v73 * hexagon: add fwd declaration of htp_context * hex-hmx: replace hmx-worker with hmx-queue that mimics dma-queue interface Simplifies the overall implementation, reduces thread wakeup roundtrips. * hex-mm: add debug log to hmx work func called from hmx-queue * Update hmx-queue.h Co-authored-by: Max Krasnyansky <max.krasnyansky@gmail.com> --------- Co-authored-by: Kim-Chyan Gan <kgan@qti.qualcomm.com> Co-authored-by: Max Krasnyansky <maxk@qti.qualcomm.com> Co-authored-by: Max Krasnyansky <max.krasnyansky@gmail.com>
1 parent fae3a28 commit 5d14e5d

10 files changed

Lines changed: 589 additions & 197 deletions

File tree

ggml/src/ggml-hexagon/htp/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -47,6 +47,7 @@ list(FIND HTP_HMX_VERSIONS ${DSP_VERSION} _hmx_idx)
4747

4848
if (_hmx_idx GREATER_EQUAL 0)
4949
target_sources(${HTP_LIB} PRIVATE
50+
hmx-queue.c
5051
hmx-matmul-ops.c
5152
)
5253

ggml/src/ggml-hexagon/htp/hex-utils.h

Lines changed: 13 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,14 @@ static inline uint64_t hex_get_pktcnt() {
3131
return pktcnt;
3232
}
3333

34+
// Round x up to the nearest power of two.
//
// Returns 1 for x <= 1. For x above 2^31 the mathematical result (2^32) does
// not fit in 32 bits and the return value wraps to 0.
static inline uint32_t hex_ceil_pow2(uint32_t x) {
    if (x <= 1) { return 1; }

    // Unsigned accumulator: a signed int would overflow (undefined behavior)
    // once the result reaches 2^31, i.e. for any x > 2^30.
    uint32_t p = 2;

    // Count the significant bits of (x - 1); each one doubles the result.
    x--;
    while (x >>= 1) { p <<= 1; }
    return p;
}
41+
3442
// Divide num by den, rounding the quotient up to the next integer.
// den must be non-zero.
static inline size_t hmx_ceil_div(size_t num, size_t den) {
    const size_t biased = num + den - 1;
    return biased / den;
}
@@ -73,8 +81,7 @@ static inline void hex_l2fetch(const void * p, uint32_t width, uint32_t stride,
7381
#define HEX_L2_LINE_SIZE 64
7482
#define HEX_L2_FLUSH_SIZE (128 * 1024)
7583

76-
static inline void hex_l2flush(void * addr, size_t size)
77-
{
84+
static inline void hex_l2flush(void * addr, size_t size) {
7885
if (size > HEX_L2_FLUSH_SIZE) {
7986
qurt_mem_cache_clean((qurt_addr_t) 0, 0, QURT_MEM_CACHE_FLUSH_INVALIDATE_ALL, QURT_MEM_DCACHE);
8087
} else {
@@ -89,4 +96,8 @@ static inline void hex_l2flush(void * addr, size_t size)
8996
}
9097
}
9198

99+
// Issue the Hexagon `pause(#255)` instruction.
// Intended for the body of busy-wait loops.
// NOTE(review): presumably stalls the hardware thread for a short, bounded
// number of cycles -- confirm against the Hexagon ISA reference.
static inline void hex_pause(void) {
    asm volatile(" pause(#255)\n");
}
102+
92103
#endif /* HEX_UTILS_H */

ggml/src/ggml-hexagon/htp/hmx-matmul-ops.c

Lines changed: 251 additions & 137 deletions
Large diffs are not rendered by default.
Lines changed: 158 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,158 @@
1+
#pragma clang diagnostic ignored "-Wunused-function"
2+
3+
#include <stdbool.h>
4+
#include <stdlib.h>
5+
#include <string.h>
6+
7+
#include <qurt_thread.h>
8+
#include <qurt_futex.h>
9+
10+
#include <HAP_compute_res.h>
11+
12+
#include "hmx-queue.h"
13+
14+
#define QURT_LOWEST_PRIO (254)
15+
16+
static inline void hmx_lock(struct hmx_queue *q)
17+
{
18+
if (!q->hmx_locked) {
19+
HAP_compute_res_hmx_lock(q->hap_rctx);
20+
q->hmx_locked = true;
21+
}
22+
}
23+
24+
static inline void hmx_unlock(struct hmx_queue *q)
25+
{
26+
if (q->hmx_locked) {
27+
HAP_compute_res_hmx_unlock(q->hap_rctx);
28+
q->hmx_locked = false;
29+
}
30+
}
31+
32+
static inline void hmx_queue_process(struct hmx_queue *q, bool* killed) {
33+
unsigned int ir = atomic_load(&q->idx_read);
34+
35+
while (ir != atomic_load(&q->idx_write)) {
36+
struct hmx_queue_desc *d = &q->desc[ir];
37+
if (!d->done) {
38+
FARF(HIGH, "hmx-queue-process: ir %u func %p data %p", ir, d->func, d->data);
39+
40+
enum hmx_queue_signal sig = (enum hmx_queue_signal) (unsigned int) d->func;
41+
switch (sig) {
42+
case HMX_QUEUE_NOOP: /* noop */; break;
43+
case HMX_QUEUE_KILL: *killed = true; break;
44+
case HMX_QUEUE_SUSPEND: hmx_unlock(q); break;
45+
default:
46+
hmx_lock(q);
47+
d->func(d->data);
48+
break;
49+
}
50+
51+
atomic_fetch_add(&d->done, 1);
52+
}
53+
54+
ir = (ir + 1) & q->idx_mask;
55+
atomic_store(&q->idx_read, ir);
56+
}
57+
}
58+
59+
static void hmx_queue_thread(void * arg) {
60+
struct hmx_queue * q = (struct hmx_queue *) arg;
61+
62+
FARF(HIGH, "hmx-queue-thread: started");
63+
64+
bool killed = false;
65+
66+
unsigned int poll_cnt = HMX_QUEUE_POLL_COUNT;
67+
unsigned int prev_seqn = 0;
68+
while (!killed) {
69+
unsigned int seqn = atomic_load(&q->seqn);
70+
if (seqn == prev_seqn) {
71+
if (--poll_cnt) { hex_pause(); continue; }
72+
FARF(HIGH, "hmx-queue-thread: sleeping");
73+
qurt_futex_wait(&q->seqn, prev_seqn);
74+
continue;
75+
}
76+
prev_seqn = seqn;
77+
poll_cnt = HMX_QUEUE_POLL_COUNT;
78+
79+
FARF(HIGH, "hmx-queue-thread: new work");
80+
81+
hmx_queue_process(q, &killed);
82+
}
83+
84+
FARF(HIGH, "hmx-queue-thread: stopped");
85+
}
86+
87+
struct hmx_queue * hmx_queue_create(size_t capacity, uint32_t hap_rctx) {
88+
capacity = hex_ceil_pow2(capacity);
89+
90+
struct hmx_queue * q = (struct hmx_queue *) memalign(32, sizeof(struct hmx_queue));
91+
if (q == NULL) {
92+
FARF(ERROR, "%s: failed to allocate DMA queue\n", __FUNCTION__);
93+
return NULL;
94+
}
95+
memset(q, 0, sizeof(struct hmx_queue));
96+
q->capacity = capacity;
97+
q->idx_mask = capacity - 1;
98+
q->hap_rctx = hap_rctx;
99+
100+
q->desc = (struct hmx_queue_desc *) memalign(64, capacity * sizeof(struct hmx_queue_desc));
101+
if (!q->desc) {
102+
FARF(ERROR, "hmx-queue: failed to allocate HMX queue descriptors\n");
103+
return NULL;
104+
}
105+
memset(q->desc, 0, capacity * sizeof(struct hmx_queue_desc));
106+
107+
const size_t stack_size = HMX_QUEUE_THREAD_STACK_SIZE;
108+
q->stack = (unsigned char *) memalign(64, stack_size);
109+
if (!q->stack) {
110+
FARF(ERROR, "hmx-queue: thread stack allocation failed (%zu bytes)", stack_size);
111+
return NULL;
112+
}
113+
memset(q->stack, 0, stack_size);
114+
115+
// Match caller thread priority (same pattern as worker-pool.c).
116+
int prio = qurt_thread_get_priority(qurt_thread_get_id());
117+
if (prio < 1) {
118+
prio = 1;
119+
}
120+
if (prio > QURT_LOWEST_PRIO) {
121+
prio = QURT_LOWEST_PRIO;
122+
}
123+
124+
qurt_thread_attr_t attr;
125+
qurt_thread_attr_init(&attr);
126+
qurt_thread_attr_set_stack_addr(&attr, q->stack);
127+
qurt_thread_attr_set_stack_size(&attr, stack_size);
128+
qurt_thread_attr_set_priority(&attr, prio);
129+
qurt_thread_attr_set_name(&attr, "hmx-queue");
130+
131+
int err = qurt_thread_create(&q->thread, &attr, hmx_queue_thread, q);
132+
if (err) {
133+
FARF(ERROR, "hmx-worker: thread create failed (%d)", err);
134+
return NULL;
135+
}
136+
137+
FARF(HIGH, "hmx-queue: capacity %u\n", capacity);
138+
139+
return q;
140+
}
141+
142+
// Stop the worker thread and free the queue. Accepts NULL.
//
// Outstanding descriptors are flushed first, then a KILL signal is queued and
// flushed, the worker is joined, and all queue memory is released.
void hmx_queue_delete(struct hmx_queue * q) {
    if (!q) {
        return;
    }

    // Wait for everything already queued to complete.
    hmx_queue_flush(q);

    // Tell the worker to exit. The push is retried because its "full" test
    // reads idx_read, which the worker publishes only after marking a
    // descriptor done; right after a flush a small ring can therefore still
    // look full for a moment. A dropped KILL would leave the join below
    // blocked forever.
    while (!hmx_queue_signal(q, HMX_QUEUE_KILL)) {
        hex_pause();
    }
    hmx_queue_flush(q);

    int status;
    qurt_thread_join(q->thread, &status);

    free(q->desc);
    free(q->stack);
    free(q);
}
Lines changed: 134 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,134 @@
1+
#ifndef HMX_QUEUE_H
2+
#define HMX_QUEUE_H
3+
4+
#include <stdbool.h>
5+
#include <stdint.h>
6+
#include <stdatomic.h>
7+
8+
#include <hexagon_types.h>
9+
#include <qurt_thread.h>
10+
#include <qurt_futex.h>
11+
#include <HAP_farf.h>
12+
13+
#include "hex-utils.h"
14+
15+
#ifdef __cplusplus
16+
extern "C" {
17+
#endif
18+
19+
#define HMX_QUEUE_THREAD_STACK_SIZE (16 * 1024)
20+
#define HMX_QUEUE_POLL_COUNT 2000
21+
22+
typedef void (*hmx_queue_func)(void *);
23+
24+
// Control codes stored in a descriptor's func slot in place of a real
// function pointer (dummy funcs used as signals).
enum hmx_queue_signal {
    HMX_QUEUE_NOOP    = 0,  // aka NULL
    HMX_QUEUE_SUSPEND = 1,  // worker releases the HMX lock
    HMX_QUEUE_KILL    = 2   // worker thread leaves its loop
};
30+
31+
struct hmx_queue_desc {
32+
hmx_queue_func func;
33+
void * data;
34+
atomic_uint done;
35+
};
36+
37+
struct hmx_queue {
38+
struct hmx_queue_desc * desc;
39+
atomic_uint idx_write; // updated by producer (push)
40+
atomic_uint idx_read; // updated by consumer (process)
41+
unsigned int idx_pop; // updated by producer (pop)
42+
uint32_t idx_mask;
43+
uint32_t capacity;
44+
45+
atomic_uint seqn; // incremented for all pushes, used with futex
46+
qurt_thread_t thread;
47+
void * stack;
48+
uint32_t hap_rctx;
49+
bool hmx_locked;
50+
};
51+
52+
struct hmx_queue * hmx_queue_create(size_t capacity, uint32_t hap_rctx);
53+
void hmx_queue_delete(struct hmx_queue * q);
54+
55+
// Build a descriptor for `func(data)` with its done counter at zero.
static inline struct hmx_queue_desc hmx_queue_make_desc(hmx_queue_func func, void * data) {
    struct hmx_queue_desc desc = { func, data, 0 };
    return desc;
}
59+
60+
static inline bool hmx_queue_push(struct hmx_queue * q, struct hmx_queue_desc d) {
61+
unsigned int ir = atomic_load(&q->idx_read);
62+
unsigned int iw = q->idx_write;
63+
64+
if (((iw + 1) & q->idx_mask) == ir) {
65+
FARF(HIGH, "hmx-queue-push: queue is full\n");
66+
return false;
67+
}
68+
69+
atomic_store(&d.done, 0);
70+
71+
FARF(HIGH, "hmx-queue-push: iw %u func %p data %p\n", iw, d.func, d.data);
72+
73+
q->desc[iw] = d;
74+
atomic_store(&q->idx_write, (iw + 1) & q->idx_mask);
75+
// wake up our thread
76+
atomic_fetch_add(&q->seqn, 1);
77+
qurt_futex_wake(&q->seqn, 1);
78+
79+
return true;
80+
}
81+
82+
static inline bool hmx_queue_signal(struct hmx_queue *q, enum hmx_queue_signal sig) {
83+
return hmx_queue_push(q, hmx_queue_make_desc((hmx_queue_func) sig, NULL));
84+
}
85+
86+
static inline bool hmx_queue_empty(struct hmx_queue * q) {
87+
return q->idx_pop == q->idx_write;
88+
}
89+
90+
static inline uint32_t hmx_queue_depth(struct hmx_queue * q) {
91+
return (q->idx_read - q->idx_read) & q->idx_mask;
92+
}
93+
94+
static inline uint32_t hmx_queue_capacity(struct hmx_queue * q) {
95+
return q->capacity;
96+
}
97+
98+
static inline struct hmx_queue_desc hmx_queue_pop(struct hmx_queue * q) {
99+
unsigned int ip = q->idx_pop;
100+
unsigned int iw = q->idx_write;
101+
102+
struct hmx_queue_desc rd = { NULL, NULL };
103+
if (ip == iw) {
104+
return rd;
105+
}
106+
107+
// Wait for desc to complete
108+
struct hmx_queue_desc * d = &q->desc[ip];
109+
while (!atomic_load(&d->done)) {
110+
FARF(HIGH, "hmx-queue-pop: waiting for HMX queue : %u\n", ip);
111+
hex_pause();
112+
}
113+
114+
rd = *d;
115+
q->idx_pop = (ip + 1) & q->idx_mask;
116+
117+
FARF(HIGH, "hmx-queue-pop: ip %u func %p data %p\n", ip, rd.func, rd.data);
118+
return rd;
119+
}
120+
121+
static inline void hmx_queue_flush(struct hmx_queue * q) {
122+
while (hmx_queue_pop(q).func != NULL) ;
123+
}
124+
125+
// Queue a SUSPEND signal (the worker drops the HMX lock when it reaches it)
// and then flush the queue.
static inline void hmx_queue_suspend(struct hmx_queue *q) {
    (void) hmx_queue_signal(q, HMX_QUEUE_SUSPEND);
    hmx_queue_flush(q);
}
129+
130+
#ifdef __cplusplus
131+
} // extern "C"
132+
#endif
133+
134+
#endif /* HMX_QUEUE_H */

0 commit comments

Comments
 (0)