Skip to content
238 changes: 143 additions & 95 deletions ggml/src/ggml-hexagon/ggml-hexagon.cpp

Large diffs are not rendered by default.

4 changes: 3 additions & 1 deletion ggml/src/ggml-hexagon/htp/htp-ctx.h
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ struct htp_mmap {
uint64_t size;
uint64_t base;
uint32_t fd;
uint32_t pinned;
uint32_t reserved;
};

// Scratchpad state
Expand Down Expand Up @@ -77,6 +77,8 @@ struct htp_context {
atomic_bool vtcm_valid;
atomic_bool vtcm_needs_release;

uint64_t max_vmem;

struct htp_ops_context octx;

#ifdef HTP_HAS_HMX
Expand Down
8 changes: 2 additions & 6 deletions ggml/src/ggml-hexagon/htp/htp-ops.h
Original file line number Diff line number Diff line change
Expand Up @@ -90,15 +90,11 @@ enum htp_op_code {
#define HTP_OP_MAX_INPUTS 6 // aka GGML_MAX_SRCS
#define HTP_OP_MAX_PARAMS 16 // aka GGML_MAX_OP_PARAMS

#define HTP_OP_MAX_BUFS 8
#define HTP_OP_MAX_BUFS 16
#define HTP_OP_MAX_REQS 256
#define HTP_OP_MAX_TENSORS (HTP_OP_MAX_REQS * HTP_OP_MAX_INPUTS + HTP_OP_MAX_REQS)

#if __HVX_ARCH__ < 75
#define HTP_OP_MAX_VMEM (3167538380u)
#else
#define HTP_OP_MAX_VMEM (3221225472u)
#endif
#define HTP_OP_MAX_VMEM_DEFAULT (3355443200u)

#define HTP_MMAP_MAX_VMEM (2147483648u)

Expand Down
4 changes: 2 additions & 2 deletions ggml/src/ggml-hexagon/htp/htp_iface.idl
Original file line number Diff line number Diff line change
Expand Up @@ -11,9 +11,9 @@ struct htp_iface_pmu_conf {
};

interface htp_iface : remote_handle64 {
AEEResult start(in uint32 sess_id, in uint64 dsp_queue_id, in uint32 n_hvx, in uint32 use_hmx);
AEEResult start(in uint32 sess_id, in uint64 dsp_queue_id, in uint32 n_hvx, in uint32 use_hmx, in uint64 max_vmem);
AEEResult stop();
AEEResult mmap(in uint32 fd, in uint32 size, in uint32 pinned);
AEEResult mmap(in uint32 fd, in uint32 size);
AEEResult munmap(in uint32 fd);
AEEResult profiler(in uint32 mode, in htp_iface_pmu_conf pmu);
AEEResult etm(in uint32 enable);
Expand Down
27 changes: 12 additions & 15 deletions ggml/src/ggml-hexagon/htp/main.c
Original file line number Diff line number Diff line change
Expand Up @@ -210,7 +210,7 @@ AEEResult htp_iface_close(remote_handle64 handle) {
return AEE_SUCCESS;
}

AEEResult htp_iface_mmap(remote_handle64 handle, uint32 fd, uint32 size, uint32 pinned) {
AEEResult htp_iface_mmap(remote_handle64 handle, uint32_t fd, uint32_t size) {
struct htp_context * ctx = (struct htp_context *) handle;
if (!ctx) {
return AEE_EBADPARM;
Expand All @@ -220,7 +220,6 @@ AEEResult htp_iface_mmap(remote_handle64 handle, uint32 fd, uint32 size, uint32
for (uint32_t i=0; i<HTP_MAX_MMAPS; i++) {
struct htp_mmap *m = &ctx->mmap[i];
if (m->fd == fd) {
m->pinned = pinned;
return AEE_SUCCESS;
}
}
Expand All @@ -229,7 +228,7 @@ AEEResult htp_iface_mmap(remote_handle64 handle, uint32 fd, uint32 size, uint32
for (uint32_t i=0; i<HTP_MAX_MMAPS; i++) {
struct htp_mmap *m = &ctx->mmap[i];
if (!m->size) {
FARF(HIGH, "mmap : fd %u size %u pinned %u", fd, size, pinned);
FARF(HIGH, "mmap : fd %u size %u", fd, size);
#if __HVX_ARCH__ > 73
void *va = HAP_mmap2(NULL, size, HAP_PROT_READ | HAP_PROT_WRITE, 0, fd, 0);
#else
Expand All @@ -248,7 +247,6 @@ AEEResult htp_iface_mmap(remote_handle64 handle, uint32 fd, uint32 size, uint32
m->base = (uint64_t) va;
m->fd = fd;
m->size = size;
m->pinned = pinned;

return AEE_SUCCESS;
}
Expand All @@ -275,7 +273,6 @@ AEEResult htp_iface_munmap(remote_handle64 handle, uint32 fd) {
m->size = 0;
m->base = NULL;
m->fd = -1;
m->pinned = 0;
}
}

Expand Down Expand Up @@ -358,7 +355,7 @@ static void vtcm_free(struct htp_context * ctx) {
static void htp_packet_callback(dspqueue_t queue, int error, void * context);
static void htp_error_callback(dspqueue_t queue, int error, void * context);

AEEResult htp_iface_start(remote_handle64 handle, uint32 sess_id, uint64 dsp_queue_id, uint32 n_hvx, uint32 use_hmx) {
AEEResult htp_iface_start(remote_handle64 handle, uint32 sess_id, uint64 dsp_queue_id, uint32 n_hvx, uint32 use_hmx, uint64_t max_vmem) {
struct htp_context * ctx = (struct htp_context *) handle;

if (!ctx) {
Expand All @@ -376,12 +373,12 @@ AEEResult htp_iface_start(remote_handle64 handle, uint32 sess_id, uint64 dsp_que
htp_error_callback, // Error callback; no errors expected on the DSP
(void *) ctx, // Callback context
&ctx->queue);

if (err) {
FARF(ERROR, "Queue import failed with 0x%08x", (unsigned) err);
return err;
}

ctx->max_vmem = max_vmem;
ctx->thread_id = qurt_thread_get_id();
ctx->thread_prio = qurt_thread_get_priority(ctx->thread_id);

Expand Down Expand Up @@ -622,8 +619,8 @@ static inline bool reuse_buf(struct htp_context *ctx, uint32_t *m_reuse, struct
}

static inline void drop_mmap(struct htp_context *ctx, struct htp_mmap *m) {
if (m->size && !m->pinned) {
FARF(HIGH, "unmap : fd %u base %p size %u pinned %u", m->fd, (void*) m->base, (uint32_t) m->size, m->pinned);
if (m->size) {
FARF(HIGH, "unmap : fd %u base %p size %u", m->fd, (void*) m->base, (uint32_t) m->size);
#if __HVX_ARCH__ > 73
HAP_munmap2((void *) m->base, m->size);
#else
Expand Down Expand Up @@ -660,9 +657,8 @@ static inline void mmap_buf(struct htp_context *ctx, struct htp_buf_desc *b) {
m->base = b->base = (uint64_t) va;
m->fd = b->fd;
m->size = b->size;
m->pinned = 0;

FARF(HIGH, "mmap : fd %u base %p size %u pinned %u", m->fd, (void*) m->base, (uint32_t) m->size, m->pinned);
FARF(HIGH, "mmap : fd %u base %p size %u", m->fd, (void*) m->base, (uint32_t) m->size);
return;
}
}
Expand All @@ -672,8 +668,8 @@ static void prep_op_bufs(struct htp_context *ctx, struct htp_buf_desc *bufs, uin
uint32_t m_reuse = 0; // mmap reuse mask (index from ctx->mmap array)
uint32_t b_reuse = 0; // buf reuse count

size_t m_vmem = 0; // mapped vmem
size_t e_vmem = 0; // extra vmem
uint64_t m_vmem = 0; // mapped vmem
uint64_t e_vmem = 0; // extra vmem

// See what we can reuse
for (uint32_t i=0; i < n_bufs; i++) {
Expand All @@ -687,9 +683,10 @@ static void prep_op_bufs(struct htp_context *ctx, struct htp_buf_desc *bufs, uin
// See how much vmem we have mmapped right now
for (uint32_t i=0; i<HTP_MAX_MMAPS; i++) { m_vmem += ctx->mmap[i].size; }

FARF(HIGH, "prep-bufs : pass1 mmap-vmem %zu extra-vmem %zu n-bufs %u b-reuse %u", m_vmem, e_vmem, n_bufs, b_reuse);
FARF(HIGH, "prep-bufs : pass1 mmap-vmem %zu extra-vmem %zu max-vmem %zu : n-bufs %u b-reuse %u",
(size_t) m_vmem, (size_t) e_vmem, (size_t) ctx->max_vmem, n_bufs, b_reuse);

if ((m_vmem + e_vmem) > HTP_OP_MAX_VMEM) {
if ((m_vmem + e_vmem) > ctx->max_vmem) {
// Drop unused mappings
for (uint32_t i=0; i < HTP_MAX_MMAPS; i++) {
bool used = m_reuse & (1<<i);
Expand Down
12 changes: 11 additions & 1 deletion scripts/snapdragon/adb/run-cli.sh
Original file line number Diff line number Diff line change
Expand Up @@ -54,13 +54,23 @@ opqueue=
opflt=
[ "$OF" != "" ] && opflt="GGML_HEXAGON_OPFILTER=$OF"

vmem=
[ "$VM" != "" ] && opflt="GGML_HEXAGON_VMEM=$VM"

mbuf=
[ "$MB" != "" ] && opflt="GGML_HEXAGON_MBUF=$MB"
vmem=
[ "$VM" != "" ] && vmem="GGML_HEXAGON_VMEM=$VM"

Comment thread
max-krasnyansky marked this conversation as resolved.
mbuf=
[ "$MB" != "" ] && mbuf="GGML_HEXAGON_MBUF=$MB"
set -x

adb $adbserial $adbhost shell " \
cd $basedir; ulimit -c unlimited; \
LD_LIBRARY_PATH=$basedir/$branch/lib \
ADSP_LIBRARY_PATH=$basedir/$branch/lib \
$verbose $sched $opmask $profile $nhvx $hmx $ndev $hb $opbatch $opqueue $opflt \
$verbose $sched $opmask $profile $nhvx $hmx $ndev $hb $opbatch $opqueue $opflt $vmem $mbuf \
./$branch/bin/llama-cli --no-mmap -m $basedir/../gguf/$model \
--poll 1000 -t 6 --cpu-mask 0xfc --cpu-strict 1 \
--ctx-size 8192 --ubatch-size 256 -fa on \
Expand Down
8 changes: 7 additions & 1 deletion scripts/snapdragon/adb/run-completion.sh
Original file line number Diff line number Diff line change
Expand Up @@ -54,13 +54,19 @@ opqueue=
opflt=
[ "$OF" != "" ] && opflt="GGML_HEXAGON_OPFILTER=$OF"

vmem=
[ "$VM" != "" ] && vmem="GGML_HEXAGON_VMEM=$VM"

mbuf=
[ "$MB" != "" ] && mbuf="GGML_HEXAGON_MBUF=$MB"

set -x

adb $adbserial $adbhost shell " \
cd $basedir; ulimit -c unlimited; \
LD_LIBRARY_PATH=$basedir/$branch/lib \
ADSP_LIBRARY_PATH=$basedir/$branch/lib \
$verbose $sched $opmask $profile $nhvx $hmx $ndev $hb $opbatch $opqueue $opflt \
$verbose $sched $opmask $profile $nhvx $hmx $ndev $hb $opbatch $opqueue $opflt $vmem $mbuf \
./$branch/bin/llama-completion --no-mmap -m $basedir/../gguf/$model \
--poll 1000 -t 6 --cpu-mask 0xfc --cpu-strict 1 \
--ctx-size 8192 --ubatch-size 256 -fa on \
Expand Down
Loading