diff --git a/.devops/cann.Dockerfile b/.devops/cann.Dockerfile index 843fe37d0625..acd1e26bcec8 100644 --- a/.devops/cann.Dockerfile +++ b/.devops/cann.Dockerfile @@ -5,6 +5,9 @@ # Define the CANN base image for easier version updates later ARG CHIP_TYPE=910b ARG CANN_BASE_IMAGE=quay.io/ascend/cann:8.5.0-${CHIP_TYPE}-openeuler24.03-py3.11 +ARG BUILD_DATE=N/A +ARG APP_VERSION=N/A +ARG APP_REVISION=N/A # ============================================================================== # BUILD STAGE @@ -67,6 +70,19 @@ RUN mkdir -p /app/full && \ # ============================================================================== FROM ${CANN_BASE_IMAGE} AS base +ARG BUILD_DATE=N/A +ARG APP_VERSION=N/A +ARG APP_REVISION=N/A +ARG IMAGE_URL=https://github.com/ggml-org/llama.cpp +ARG IMAGE_SOURCE=https://github.com/ggml-org/llama.cpp +LABEL org.opencontainers.image.created=$BUILD_DATE \ + org.opencontainers.image.version=$APP_VERSION \ + org.opencontainers.image.revision=$APP_REVISION \ + org.opencontainers.image.title="llama.cpp" \ + org.opencontainers.image.description="LLM inference in C/C++" \ + org.opencontainers.image.url=$IMAGE_URL \ + org.opencontainers.image.source=$IMAGE_SOURCE + # -- Install runtime dependencies -- RUN yum install -y libgomp curl && \ yum clean all && \ diff --git a/.devops/cpu.Dockerfile b/.devops/cpu.Dockerfile index d6579ecf1ad3..c8f32235d162 100644 --- a/.devops/cpu.Dockerfile +++ b/.devops/cpu.Dockerfile @@ -1,4 +1,7 @@ ARG UBUNTU_VERSION=24.04 +ARG BUILD_DATE=N/A +ARG APP_VERSION=N/A +ARG APP_REVISION=N/A FROM ubuntu:$UBUNTU_VERSION AS build @@ -35,6 +38,19 @@ RUN mkdir -p /app/full \ ## Base image FROM ubuntu:$UBUNTU_VERSION AS base +ARG BUILD_DATE=N/A +ARG APP_VERSION=N/A +ARG APP_REVISION=N/A +ARG IMAGE_URL=https://github.com/ggml-org/llama.cpp +ARG IMAGE_SOURCE=https://github.com/ggml-org/llama.cpp +LABEL org.opencontainers.image.created=$BUILD_DATE \ + org.opencontainers.image.version=$APP_VERSION \ + 
org.opencontainers.image.revision=$APP_REVISION \ + org.opencontainers.image.title="llama.cpp" \ + org.opencontainers.image.description="LLM inference in C/C++" \ + org.opencontainers.image.url=$IMAGE_URL \ + org.opencontainers.image.source=$IMAGE_SOURCE + RUN apt-get update \ && apt-get install -y libgomp1 curl \ && apt autoremove -y \ diff --git a/.devops/cuda.Dockerfile b/.devops/cuda.Dockerfile index b3f6ccfc9845..3805ea3a0096 100644 --- a/.devops/cuda.Dockerfile +++ b/.devops/cuda.Dockerfile @@ -6,6 +6,10 @@ ARG BASE_CUDA_DEV_CONTAINER=nvidia/cuda:${CUDA_VERSION}-devel-ubuntu${UBUNTU_VER ARG BASE_CUDA_RUN_CONTAINER=nvidia/cuda:${CUDA_VERSION}-runtime-ubuntu${UBUNTU_VERSION} +ARG BUILD_DATE=N/A +ARG APP_VERSION=N/A +ARG APP_REVISION=N/A + FROM ${BASE_CUDA_DEV_CONTAINER} AS build # CUDA architecture to build for (defaults to all supported archs) @@ -40,6 +44,19 @@ RUN mkdir -p /app/full \ ## Base image FROM ${BASE_CUDA_RUN_CONTAINER} AS base +ARG BUILD_DATE=N/A +ARG APP_VERSION=N/A +ARG APP_REVISION=N/A +ARG IMAGE_URL=https://github.com/ggml-org/llama.cpp +ARG IMAGE_SOURCE=https://github.com/ggml-org/llama.cpp +LABEL org.opencontainers.image.created=$BUILD_DATE \ + org.opencontainers.image.version=$APP_VERSION \ + org.opencontainers.image.revision=$APP_REVISION \ + org.opencontainers.image.title="llama.cpp" \ + org.opencontainers.image.description="LLM inference in C/C++" \ + org.opencontainers.image.url=$IMAGE_URL \ + org.opencontainers.image.source=$IMAGE_SOURCE + RUN apt-get update \ && apt-get install -y libgomp1 curl \ && apt autoremove -y \ diff --git a/.devops/intel.Dockerfile b/.devops/intel.Dockerfile index da164dcfa5ba..218418b80b67 100644 --- a/.devops/intel.Dockerfile +++ b/.devops/intel.Dockerfile @@ -1,4 +1,7 @@ ARG ONEAPI_VERSION=2025.3.3-0-devel-ubuntu24.04 +ARG BUILD_DATE=N/A +ARG APP_VERSION=N/A +ARG APP_REVISION=N/A ## Build Image @@ -40,6 +43,19 @@ RUN mkdir -p /app/full \ FROM intel/deep-learning-essentials:$ONEAPI_VERSION AS base +ARG 
BUILD_DATE=N/A +ARG APP_VERSION=N/A +ARG APP_REVISION=N/A +ARG IMAGE_URL=https://github.com/ggml-org/llama.cpp +ARG IMAGE_SOURCE=https://github.com/ggml-org/llama.cpp +LABEL org.opencontainers.image.created=$BUILD_DATE \ + org.opencontainers.image.version=$APP_VERSION \ + org.opencontainers.image.revision=$APP_REVISION \ + org.opencontainers.image.title="llama.cpp" \ + org.opencontainers.image.description="LLM inference in C/C++" \ + org.opencontainers.image.url=$IMAGE_URL \ + org.opencontainers.image.source=$IMAGE_SOURCE + ARG IGC_VERSION=v2.20.5 ARG IGC_VERSION_FULL=2_2.20.5+19972 ARG COMPUTE_RUNTIME_VERSION=25.40.35563.10 diff --git a/.devops/llama-cli-cann.Dockerfile b/.devops/llama-cli-cann.Dockerfile index d54e70838f23..447d871ac4fb 100644 --- a/.devops/llama-cli-cann.Dockerfile +++ b/.devops/llama-cli-cann.Dockerfile @@ -1,4 +1,7 @@ ARG ASCEND_VERSION=8.5.0-910b-openeuler22.03-py3.10 +ARG BUILD_DATE=N/A +ARG APP_VERSION=N/A +ARG APP_REVISION=N/A FROM ascendai/cann:$ASCEND_VERSION AS build @@ -28,6 +31,20 @@ RUN echo "Building with static libs" && \ # TODO: use image with NNRT FROM ascendai/cann:$ASCEND_VERSION AS runtime + +ARG BUILD_DATE=N/A +ARG APP_VERSION=N/A +ARG APP_REVISION=N/A +ARG IMAGE_URL=https://github.com/ggml-org/llama.cpp +ARG IMAGE_SOURCE=https://github.com/ggml-org/llama.cpp +LABEL org.opencontainers.image.created=$BUILD_DATE \ + org.opencontainers.image.version=$APP_VERSION \ + org.opencontainers.image.revision=$APP_REVISION \ + org.opencontainers.image.title="llama.cpp" \ + org.opencontainers.image.description="LLM inference in C/C++" \ + org.opencontainers.image.url=$IMAGE_URL \ + org.opencontainers.image.source=$IMAGE_SOURCE + COPY --from=build /app/build/bin/llama-cli /app/build/bin/llama-completion / ENV LC_ALL=C.utf8 diff --git a/.devops/musa.Dockerfile b/.devops/musa.Dockerfile index 665a76f58ce2..a7f70b5f0df6 100644 --- a/.devops/musa.Dockerfile +++ b/.devops/musa.Dockerfile @@ -6,6 +6,10 @@ ARG 
BASE_MUSA_DEV_CONTAINER=mthreads/musa:${MUSA_VERSION}-devel-ubuntu${UBUNTU_V ARG BASE_MUSA_RUN_CONTAINER=mthreads/musa:${MUSA_VERSION}-runtime-ubuntu${UBUNTU_VERSION}-amd64 +ARG BUILD_DATE=N/A +ARG APP_VERSION=N/A +ARG APP_REVISION=N/A + FROM ${BASE_MUSA_DEV_CONTAINER} AS build # MUSA architecture to build for (defaults to all supported archs) @@ -45,6 +49,19 @@ RUN mkdir -p /app/full \ ## Base image FROM ${BASE_MUSA_RUN_CONTAINER} AS base +ARG BUILD_DATE=N/A +ARG APP_VERSION=N/A +ARG APP_REVISION=N/A +ARG IMAGE_URL=https://github.com/ggml-org/llama.cpp +ARG IMAGE_SOURCE=https://github.com/ggml-org/llama.cpp +LABEL org.opencontainers.image.created=$BUILD_DATE \ + org.opencontainers.image.version=$APP_VERSION \ + org.opencontainers.image.revision=$APP_REVISION \ + org.opencontainers.image.title="llama.cpp" \ + org.opencontainers.image.description="LLM inference in C/C++" \ + org.opencontainers.image.url=$IMAGE_URL \ + org.opencontainers.image.source=$IMAGE_SOURCE + RUN apt-get update \ && apt-get install -y libgomp1 curl \ && apt autoremove -y \ diff --git a/.devops/openvino.Dockerfile b/.devops/openvino.Dockerfile index 31b58736d7e5..1266713f3788 100644 --- a/.devops/openvino.Dockerfile +++ b/.devops/openvino.Dockerfile @@ -18,6 +18,10 @@ ARG LIBZE1_VERSION=1.27.0-1~24.04~ppa2 ARG http_proxy= ARG https_proxy= +ARG BUILD_DATE=N/A +ARG APP_VERSION=N/A +ARG APP_REVISION=N/A + ## Build Image FROM ubuntu:${UBUNTU_VERSION} AS build @@ -88,6 +92,18 @@ FROM ubuntu:${UBUNTU_VERSION} AS base # Pass proxy args to runtime stage ARG http_proxy ARG https_proxy +ARG BUILD_DATE=N/A +ARG APP_VERSION=N/A +ARG APP_REVISION=N/A +ARG IMAGE_URL=https://github.com/ggml-org/llama.cpp +ARG IMAGE_SOURCE=https://github.com/ggml-org/llama.cpp +LABEL org.opencontainers.image.created=$BUILD_DATE \ + org.opencontainers.image.version=$APP_VERSION \ + org.opencontainers.image.revision=$APP_REVISION \ + org.opencontainers.image.title="llama.cpp" \ + org.opencontainers.image.description="LLM 
inference in C/C++" \ + org.opencontainers.image.url=$IMAGE_URL \ + org.opencontainers.image.source=$IMAGE_SOURCE RUN apt-get update \ && apt-get install -y libgomp1 libtbb12 curl wget ocl-icd-libopencl1 \ diff --git a/.devops/rocm.Dockerfile b/.devops/rocm.Dockerfile index 525ddc790511..2da15975d13f 100644 --- a/.devops/rocm.Dockerfile +++ b/.devops/rocm.Dockerfile @@ -7,6 +7,10 @@ ARG AMDGPU_VERSION=7.2.1 # Target the ROCm build image ARG BASE_ROCM_DEV_CONTAINER=rocm/dev-ubuntu-${UBUNTU_VERSION}:${ROCM_VERSION}-complete +ARG BUILD_DATE=N/A +ARG APP_VERSION=N/A +ARG APP_REVISION=N/A + ### Build image FROM ${BASE_ROCM_DEV_CONTAINER} AS build @@ -57,6 +61,19 @@ RUN mkdir -p /app/full \ ## Base image FROM ${BASE_ROCM_DEV_CONTAINER} AS base +ARG BUILD_DATE=N/A +ARG APP_VERSION=N/A +ARG APP_REVISION=N/A +ARG IMAGE_URL=https://github.com/ggml-org/llama.cpp +ARG IMAGE_SOURCE=https://github.com/ggml-org/llama.cpp +LABEL org.opencontainers.image.created=$BUILD_DATE \ + org.opencontainers.image.version=$APP_VERSION \ + org.opencontainers.image.revision=$APP_REVISION \ + org.opencontainers.image.title="llama.cpp" \ + org.opencontainers.image.description="LLM inference in C/C++" \ + org.opencontainers.image.url=$IMAGE_URL \ + org.opencontainers.image.source=$IMAGE_SOURCE + RUN apt-get update \ && apt-get install -y libgomp1 curl \ && apt autoremove -y \ diff --git a/.devops/s390x.Dockerfile b/.devops/s390x.Dockerfile index 757cd97cd4cc..d36f5f3ccc5d 100644 --- a/.devops/s390x.Dockerfile +++ b/.devops/s390x.Dockerfile @@ -1,5 +1,8 @@ ARG GCC_VERSION=15.2.0 ARG UBUNTU_VERSION=24.04 +ARG BUILD_DATE=N/A +ARG APP_VERSION=N/A +ARG APP_REVISION=N/A ### Build Llama.cpp stage FROM gcc:${GCC_VERSION} AS build @@ -52,6 +55,19 @@ COPY --from=build /opt/llama.cpp/gguf-py /llama.cpp/gguf-py ### Base image FROM ubuntu:${UBUNTU_VERSION} AS base +ARG BUILD_DATE=N/A +ARG APP_VERSION=N/A +ARG APP_REVISION=N/A +ARG IMAGE_URL=https://github.com/ggml-org/llama.cpp +ARG 
IMAGE_SOURCE=https://github.com/ggml-org/llama.cpp +LABEL org.opencontainers.image.created=$BUILD_DATE \ + org.opencontainers.image.version=$APP_VERSION \ + org.opencontainers.image.revision=$APP_REVISION \ + org.opencontainers.image.title="llama.cpp" \ + org.opencontainers.image.description="LLM inference in C/C++" \ + org.opencontainers.image.url=$IMAGE_URL \ + org.opencontainers.image.source=$IMAGE_SOURCE + RUN --mount=type=cache,target=/var/cache/apt,sharing=locked \ --mount=type=cache,target=/var/lib/apt/lists,sharing=locked \ apt update -y && \ diff --git a/.devops/vulkan.Dockerfile b/.devops/vulkan.Dockerfile index f4d199ed4261..464ccfef1ce4 100644 --- a/.devops/vulkan.Dockerfile +++ b/.devops/vulkan.Dockerfile @@ -1,4 +1,7 @@ ARG UBUNTU_VERSION=26.04 +ARG BUILD_DATE=N/A +ARG APP_VERSION=N/A +ARG APP_REVISION=N/A FROM ubuntu:$UBUNTU_VERSION AS build @@ -31,6 +34,19 @@ RUN mkdir -p /app/full \ ## Base image FROM ubuntu:$UBUNTU_VERSION AS base +ARG BUILD_DATE=N/A +ARG APP_VERSION=N/A +ARG APP_REVISION=N/A +ARG IMAGE_URL=https://github.com/ggml-org/llama.cpp +ARG IMAGE_SOURCE=https://github.com/ggml-org/llama.cpp +LABEL org.opencontainers.image.created=$BUILD_DATE \ + org.opencontainers.image.version=$APP_VERSION \ + org.opencontainers.image.revision=$APP_REVISION \ + org.opencontainers.image.title="llama.cpp" \ + org.opencontainers.image.description="LLM inference in C/C++" \ + org.opencontainers.image.url=$IMAGE_URL \ + org.opencontainers.image.source=$IMAGE_SOURCE + RUN apt-get update \ && apt-get install -y libgomp1 curl libvulkan1 mesa-vulkan-drivers \ libglvnd0 libgl1 libglx0 libegl1 libgles2 \ diff --git a/.github/workflows/docker.yml b/.github/workflows/docker.yml index a5bae7141fe1..6f1f2721e45f 100644 --- a/.github/workflows/docker.yml +++ b/.github/workflows/docker.yml @@ -11,6 +11,11 @@ name: Publish Docker image on: workflow_dispatch: # allows manual triggering + inputs: + skip_s390x: + description: "Skip the s390x build target (useful for fast 
test runs that do not need the IBM Z runner)" + type: boolean + default: false schedule: # Rebuild daily rather than on every push because it is expensive - cron: '12 4 * * *' @@ -64,6 +69,8 @@ jobs: - name: Generate build and merge matrices id: matrices shell: bash + env: + SKIP_S390X: ${{ inputs.skip_s390x || 'false' }} run: | set -euo pipefail @@ -86,6 +93,11 @@ jobs: ] JSON + if [ "${SKIP_S390X}" = "true" ]; then + jq 'map(select(.platforms != "linux/s390x"))' build-matrix.json > build-matrix.json.tmp + mv build-matrix.json.tmp build-matrix.json + fi + BUILD_MATRIX="$(jq -c . build-matrix.json)" MERGE_MATRIX="$(jq -c ' reduce .[] as $entry ({}; .[$entry.tag] |= ( @@ -132,6 +144,7 @@ jobs: config: ${{ fromJSON(needs.prepare_matrices.outputs.build_matrix) }} steps: - name: Check out the repo + id: checkout uses: actions/checkout@v6 with: fetch-depth: 0 @@ -187,6 +200,10 @@ jobs: env: GITHUB_REPOSITORY_OWNER: '${{ github.repository_owner }}' + - name: Get build date + id: build_date + run: echo "date=$(date -u +"%Y-%m-%dT%H:%M:%SZ")" >> $GITHUB_OUTPUT + - name: Free Disk Space (Ubuntu) if: ${{ matrix.config.free_disk_space == true }} uses: ggml-org/free-disk-space@v1.3.1 @@ -211,13 +228,26 @@ jobs: with: context: . 
platforms: ${{ matrix.config.platforms }} - outputs: type=image,name=${{ steps.meta.outputs.image_repo }},push-by-digest=true,name-canonical=true,push=true + outputs: type=image,name=${{ steps.meta.outputs.image_repo }},push-by-digest=true,name-canonical=true,push=true,oci-mediatypes=true file: ${{ matrix.config.dockerfile }} target: full provenance: false build-args: | + BUILD_DATE=${{ steps.build_date.outputs.date }} + APP_VERSION=${{ needs.create_tag.outputs.source_tag }} + APP_REVISION=${{ steps.checkout.outputs.commit }} + IMAGE_URL=${{ github.server_url }}/${{ github.repository }} + IMAGE_SOURCE=${{ github.server_url }}/${{ github.repository }} ${{ matrix.config.ubuntu_version && format('UBUNTU_VERSION={0}', matrix.config.ubuntu_version) || '' }} ${{ matrix.config.cuda_version && format('CUDA_VERSION={0}', matrix.config.cuda_version) || '' }} + annotations: | + manifest:org.opencontainers.image.created=${{ steps.build_date.outputs.date }} + manifest:org.opencontainers.image.version=${{ needs.create_tag.outputs.source_tag }} + manifest:org.opencontainers.image.revision=${{ steps.checkout.outputs.commit }} + manifest:org.opencontainers.image.title=llama.cpp + manifest:org.opencontainers.image.description=LLM inference in C/C++ + manifest:org.opencontainers.image.url=${{ github.server_url }}/${{ github.repository }} + manifest:org.opencontainers.image.source=${{ github.server_url }}/${{ github.repository }} # using github experimental cache #cache-from: type=gha #cache-to: type=gha,mode=max @@ -235,13 +265,26 @@ jobs: with: context: . 
platforms: ${{ matrix.config.platforms }} - outputs: type=image,name=${{ steps.meta.outputs.image_repo }},push-by-digest=true,name-canonical=true,push=true + outputs: type=image,name=${{ steps.meta.outputs.image_repo }},push-by-digest=true,name-canonical=true,push=true,oci-mediatypes=true file: ${{ matrix.config.dockerfile }} target: light provenance: false build-args: | + BUILD_DATE=${{ steps.build_date.outputs.date }} + APP_VERSION=${{ needs.create_tag.outputs.source_tag }} + APP_REVISION=${{ steps.checkout.outputs.commit }} + IMAGE_URL=${{ github.server_url }}/${{ github.repository }} + IMAGE_SOURCE=${{ github.server_url }}/${{ github.repository }} ${{ matrix.config.ubuntu_version && format('UBUNTU_VERSION={0}', matrix.config.ubuntu_version) || '' }} ${{ matrix.config.cuda_version && format('CUDA_VERSION={0}', matrix.config.cuda_version) || '' }} + annotations: | + manifest:org.opencontainers.image.created=${{ steps.build_date.outputs.date }} + manifest:org.opencontainers.image.version=${{ needs.create_tag.outputs.source_tag }} + manifest:org.opencontainers.image.revision=${{ steps.checkout.outputs.commit }} + manifest:org.opencontainers.image.title=llama.cpp + manifest:org.opencontainers.image.description=LLM inference in C/C++ + manifest:org.opencontainers.image.url=${{ github.server_url }}/${{ github.repository }} + manifest:org.opencontainers.image.source=${{ github.server_url }}/${{ github.repository }} # using github experimental cache #cache-from: type=gha #cache-to: type=gha,mode=max @@ -259,13 +302,26 @@ jobs: with: context: . 
platforms: ${{ matrix.config.platforms }} - outputs: type=image,name=${{ steps.meta.outputs.image_repo }},push-by-digest=true,name-canonical=true,push=true + outputs: type=image,name=${{ steps.meta.outputs.image_repo }},push-by-digest=true,name-canonical=true,push=true,oci-mediatypes=true file: ${{ matrix.config.dockerfile }} target: server provenance: false build-args: | + BUILD_DATE=${{ steps.build_date.outputs.date }} + APP_VERSION=${{ needs.create_tag.outputs.source_tag }} + APP_REVISION=${{ steps.checkout.outputs.commit }} + IMAGE_URL=${{ github.server_url }}/${{ github.repository }} + IMAGE_SOURCE=${{ github.server_url }}/${{ github.repository }} ${{ matrix.config.ubuntu_version && format('UBUNTU_VERSION={0}', matrix.config.ubuntu_version) || '' }} ${{ matrix.config.cuda_version && format('CUDA_VERSION={0}', matrix.config.cuda_version) || '' }} + annotations: | + manifest:org.opencontainers.image.created=${{ steps.build_date.outputs.date }} + manifest:org.opencontainers.image.version=${{ needs.create_tag.outputs.source_tag }} + manifest:org.opencontainers.image.revision=${{ steps.checkout.outputs.commit }} + manifest:org.opencontainers.image.title=llama.cpp + manifest:org.opencontainers.image.description=LLM inference in C/C++ + manifest:org.opencontainers.image.url=${{ github.server_url }}/${{ github.repository }} + manifest:org.opencontainers.image.source=${{ github.server_url }}/${{ github.repository }} # using github experimental cache #cache-from: type=gha #cache-to: type=gha,mode=max @@ -330,10 +386,15 @@ jobs: steps: - name: Check out the repo + id: checkout uses: actions/checkout@v6 with: fetch-depth: 0 + - name: Get build date + id: build_date + run: echo "date=$(date -u +"%Y-%m-%dT%H:%M:%SZ")" >> $GITHUB_OUTPUT + - name: Download digest metadata uses: actions/download-artifact@3e5f45b2cfb9172054b4087a40e8e0b5a5461e7c # v8 with: @@ -361,6 +422,8 @@ jobs: IMAGE_REPO="ghcr.io/${REPO_OWNER}/${REPO_NAME}" PREFIX="${IMAGE_REPO}:" SRC_TAG="${{ 
needs.create_tag.outputs.source_tag }}" + BUILD_DATE="${{ steps.build_date.outputs.date }}" + COMMIT_SHA="${{ steps.checkout.outputs.commit }}" TAGS="${{ matrix.config.tag }}" ARCHES="${{ matrix.config.arches }}" DIGEST_GLOB="/tmp/digests/*.tsv" @@ -412,11 +475,21 @@ jobs: refs+=("${IMAGE_REPO}@${digest}") done + local annotations=( + --annotation "index:org.opencontainers.image.created=${BUILD_DATE}" + --annotation "index:org.opencontainers.image.version=${SRC_TAG}" + --annotation "index:org.opencontainers.image.revision=${COMMIT_SHA}" + --annotation "index:org.opencontainers.image.title=llama.cpp" + --annotation "index:org.opencontainers.image.description=LLM inference in C/C++" + --annotation "index:org.opencontainers.image.url=${{ github.server_url }}/${{ github.repository }}" + --annotation "index:org.opencontainers.image.source=${{ github.server_url }}/${{ github.repository }}" + ) + echo "Creating ${merged_tag} from ${refs[*]}" - docker buildx imagetools create --tag "${merged_tag}" "${refs[@]}" + docker buildx imagetools create "${annotations[@]}" --tag "${merged_tag}" "${refs[@]}" echo "Creating ${merged_versioned_tag} from ${refs[*]}" - docker buildx imagetools create --tag "${merged_versioned_tag}" "${refs[@]}" + docker buildx imagetools create "${annotations[@]}" --tag "${merged_versioned_tag}" "${refs[@]}" } for tag in $TAGS; do diff --git a/ggml/src/ggml-hexagon/ggml-hexagon.cpp b/ggml/src/ggml-hexagon/ggml-hexagon.cpp index 3d1c9da83298..2f75e97ac66e 100644 --- a/ggml/src/ggml-hexagon/ggml-hexagon.cpp +++ b/ggml/src/ggml-hexagon/ggml-hexagon.cpp @@ -2744,6 +2744,18 @@ static bool ggml_hexagon_supported_ssm_conv(const struct ggml_hexagon_session * return true; } +static bool ggml_hexagon_supported_pad(const struct ggml_hexagon_session * sess, const struct ggml_tensor * op) { + const struct ggml_tensor * src0 = op->src[0]; + const struct ggml_tensor * dst = op; + + if (src0->type != GGML_TYPE_F32 || dst->type != GGML_TYPE_F32) { + return false; + } + + 
GGML_UNUSED(sess); + return true; +} + static bool ggml_hexagon_supported_cumsum(const struct ggml_hexagon_session * sess, const struct ggml_tensor * op) { const struct ggml_tensor * src0 = op->src[0]; const struct ggml_tensor * dst = op; @@ -2816,6 +2828,21 @@ static bool ggml_hexagon_supported_solve_tri(const struct ggml_hexagon_session * return true; } +static bool ggml_hexagon_supported_tri(const struct ggml_hexagon_session * sess, const struct ggml_tensor * op) { + + const struct ggml_tensor * src0 = op->src[0]; + const struct ggml_tensor * dst = op; + + if (src0->type != GGML_TYPE_F32) { return false; } + if (dst->type != GGML_TYPE_F32) { return false; } + if (!ggml_are_same_shape(src0, dst)) { return false; } + if (!ggml_is_contiguous(src0) || !ggml_is_contiguous(dst)) { return false; } + + return true; + + GGML_UNUSED(sess); +} + static const char * ggml_backend_hexagon_name(ggml_backend_t backend) { auto sess = static_cast<ggml_hexagon_session *>(backend->context); return sess->c_name(); @@ -2857,6 +2884,9 @@ static htp_op_code op_remap_to_htp(const ggml_tensor * t) { case GGML_OP_FILL: return HTP_OP_FILL; case GGML_OP_DIAG: return HTP_OP_DIAG; case GGML_OP_SOLVE_TRI: return HTP_OP_SOLVE_TRI; + case GGML_OP_TRI: return HTP_OP_TRI; + case GGML_OP_PAD: return HTP_OP_PAD; + case GGML_OP_UNARY: switch (ggml_get_unary_op(t)) { case GGML_UNARY_OP_SILU: return HTP_OP_UNARY_SILU; @@ -3416,6 +3446,14 @@ static bool ggml_backend_hexagon_device_supports_op(ggml_backend_dev_t dev, cons supp = ggml_hexagon_supported_solve_tri(sess, op); break; + case GGML_OP_TRI: + supp = ggml_hexagon_supported_tri(sess, op); + break; + + case GGML_OP_PAD: + supp = ggml_hexagon_supported_pad(sess, op); + break; + default: break; } diff --git a/ggml/src/ggml-hexagon/htp/CMakeLists.txt b/ggml/src/ggml-hexagon/htp/CMakeLists.txt index bcadac11f951..36f923243cda 100644 --- a/ggml/src/ggml-hexagon/htp/CMakeLists.txt +++ b/ggml/src/ggml-hexagon/htp/CMakeLists.txt @@ -38,6 +38,7 @@ add_library(${HTP_LIB} SHARED 
diag-ops.c solve-tri-ops.c gated-delta-net-ops.c + pad-ops.c ) target_compile_definitions(${HTP_LIB} PRIVATE diff --git a/ggml/src/ggml-hexagon/htp/htp-ctx.h b/ggml/src/ggml-hexagon/htp/htp-ctx.h index 92f02eac6e31..6fe3e6c7d85a 100644 --- a/ggml/src/ggml-hexagon/htp/htp-ctx.h +++ b/ggml/src/ggml-hexagon/htp/htp-ctx.h @@ -107,5 +107,7 @@ int op_fill(struct htp_ops_context * octx); int op_diag(struct htp_ops_context * octx); int op_solve_tri(struct htp_ops_context * octx); int op_gated_delta_net(struct htp_ops_context * octx); +int op_tri(struct htp_ops_context * octx); +int op_pad(struct htp_ops_context * octx); #endif /* HTP_CTX_H */ diff --git a/ggml/src/ggml-hexagon/htp/htp-ops.h b/ggml/src/ggml-hexagon/htp/htp-ops.h index 98db864dd42a..676e948a4398 100644 --- a/ggml/src/ggml-hexagon/htp/htp-ops.h +++ b/ggml/src/ggml-hexagon/htp/htp-ops.h @@ -86,6 +86,8 @@ enum htp_op_code { HTP_OP_SOLVE_TRI, HTP_OP_L2_NORM, HTP_OP_GATED_DELTA_NET, + HTP_OP_TRI, + HTP_OP_PAD, HTP_OP_INVALID }; diff --git a/ggml/src/ggml-hexagon/htp/main.c b/ggml/src/ggml-hexagon/htp/main.c index 883a31d61634..12003c1fd8a4 100644 --- a/ggml/src/ggml-hexagon/htp/main.c +++ b/ggml/src/ggml-hexagon/htp/main.c @@ -595,9 +595,15 @@ static int execute_op(struct htp_ops_context * octx) { case HTP_OP_SOLVE_TRI: return op_solve_tri(octx); + case HTP_OP_PAD: + return op_pad(octx); + case HTP_OP_GATED_DELTA_NET: return op_gated_delta_net(octx); + case HTP_OP_TRI: + return op_tri(octx); + case HTP_OP_INVALID: break; diff --git a/ggml/src/ggml-hexagon/htp/pad-ops.c b/ggml/src/ggml-hexagon/htp/pad-ops.c new file mode 100644 index 000000000000..3abc3c2ead17 --- /dev/null +++ b/ggml/src/ggml-hexagon/htp/pad-ops.c @@ -0,0 +1,545 @@ +#pragma clang diagnostic ignored "-Wunused-variable" +#pragma clang diagnostic ignored "-Wunused-function" +#pragma clang diagnostic ignored "-Wunused-but-set-variable" + +#include <HAP_farf.h> +#include <HAP_perf.h> + +#include <string.h> + +#include "hex-dma.h" +#include "hvx-utils.h" + +#define GGML_COMMON_DECL_C 
+#include "ggml-common.h" +#include "htp-ctx.h" +#include "htp-ops.h" + +/* Circular wrap: maps any integer x into [0, n) */ +static inline uint32_t wrap_around(int32_t x, uint32_t n) { + return (uint32_t)(((x % (int32_t)n) + (int32_t)n) % (int32_t)n); +} + +/* Decompose a flat dst row index into (i1, i2, i3) */ +static inline void pad_decompose_row(uint32_t ir, uint32_t ne1, uint32_t ne2, + uint32_t *i1, uint32_t *i2, uint32_t *i3) { + *i1 = ir % ne1; + *i2 = (ir / ne1) % ne2; + *i3 = ir / (ne1 * ne2); +} + +/* Return non-zero if row (i1,i2,i3) falls in the non-padded interior */ +static inline int pad_is_interior(uint32_t i1, uint32_t i2, uint32_t i3, + int32_t lp1, int32_t rp1, uint32_t ne1, + int32_t lp2, int32_t rp2, uint32_t ne2, + int32_t lp3, int32_t rp3, uint32_t ne3) { + return ((int32_t)i1 >= lp1 && (int32_t)i1 < (int32_t)ne1 - rp1) && + ((int32_t)i2 >= lp2 && (int32_t)i2 < (int32_t)ne2 - rp2) && + ((int32_t)i3 >= lp3 && (int32_t)i3 < (int32_t)ne3 - rp3); +} + +/* Compute the DDR src row pointer for a zero-pad interior row */ +static inline const uint8_t * pad_src_row_ptr(const struct htp_tensor * src, + uint32_t i1, uint32_t i2, uint32_t i3, + int32_t lp1, int32_t lp2, int32_t lp3) { + return (const uint8_t *) src->data + + (i1 - (uint32_t)lp1) * src->nb[1] + + (i2 - (uint32_t)lp2) * src->nb[2] + + (i3 - (uint32_t)lp3) * src->nb[3]; +} + +/* Compute the DDR src row pointer for a circular row (wrap-around indexing) */ +static inline const uint8_t * pad_circ_src_row_ptr(const struct htp_tensor * src, + uint32_t i1, uint32_t i2, uint32_t i3, + int32_t lp1, int32_t lp2, int32_t lp3) { + return (const uint8_t *) src->data + + wrap_around((int32_t)i1 - lp1, src->ne[1]) * src->nb[1] + + wrap_around((int32_t)i2 - lp2, src->ne[2]) * src->nb[2] + + wrap_around((int32_t)i3 - lp3, src->ne[3]) * src->nb[3]; +} + +struct htp_pad_context { + struct htp_ops_context * octx; + + int32_t lp0, rp0; + int32_t lp1, rp1; + int32_t lp2, rp2; + int32_t lp3, rp3; + + uint32_t 
nrows_per_thread; + uint32_t total_dst_rows; + + size_t type_size; + + // Row sizes for DMA kernel (populated when VTCM is available) + size_t src_row_size; + size_t src_row_size_aligned; + size_t dst_row_size; + size_t dst_row_size_aligned; +}; + +#define htp_pad_preamble \ + const struct htp_tensor * src = octx->src[0]; \ + const struct htp_tensor * dst = octx->dst; \ + \ + const uint32_t ne00 = src->ne[0]; \ + const uint32_t nb00 = src->nb[0]; \ + \ + const uint32_t ne0 = dst->ne[0]; \ + const uint32_t ne1 = dst->ne[1]; \ + const uint32_t ne2 = dst->ne[2]; \ + const uint32_t ne3 = dst->ne[3]; \ + \ + const uint32_t nb1 = dst->nb[1]; \ + const uint32_t nb2 = dst->nb[2]; \ + const uint32_t nb3 = dst->nb[3]; \ + \ + const int32_t lp0 = pctx->lp0, rp0 = pctx->rp0; \ + const int32_t lp1 = pctx->lp1, rp1 = pctx->rp1; \ + const int32_t lp2 = pctx->lp2, rp2 = pctx->rp2; \ + const int32_t lp3 = pctx->lp3, rp3 = pctx->rp3; \ + \ + const size_t type_size = pctx->type_size; \ + \ + const uint32_t row_start = pctx->nrows_per_thread * ith; \ + const uint32_t row_end = MIN(row_start + pctx->nrows_per_thread, pctx->total_dst_rows); + + +#define htp_pad_dma_preamble \ + const size_t src_row_size = pctx->src_row_size; \ + const size_t src_row_size_aligned = pctx->src_row_size_aligned; \ + const size_t dst_row_size = pctx->dst_row_size; \ + const size_t dst_row_size_aligned = pctx->dst_row_size_aligned; \ + \ + uint8_t * src_spad_base = octx->src0_spad.data + ith * octx->src0_spad.size_per_thread; \ + uint8_t * dst_spad_base = octx->dst_spad.data + ith * octx->dst_spad.size_per_thread; \ + \ + dma_queue * dma = octx->ctx->dma[ith]; + +// --------------------------------------------------------------------------- +// HVX vectorized PAD kernel +// --------------------------------------------------------------------------- + +static void pad_job_per_thread_hvx(unsigned int nth, unsigned int ith, void * data) { + const struct htp_pad_context * pctx = (const struct htp_pad_context *) 
data; + struct htp_ops_context * octx = pctx->octx; + htp_pad_preamble; + + uint64_t t1, t2; + t1 = HAP_perf_get_qtimer_count(); + + for (uint32_t dst_row = row_start; dst_row < row_end; dst_row++) { + uint32_t i1, i2, i3; + pad_decompose_row(dst_row, ne1, ne2, &i1, &i2, &i3); + + uint8_t * dst_ptr = (uint8_t *) dst->data + i1 * nb1 + i2 * nb2 + i3 * nb3; + + const int interior = pad_is_interior(i1, i2, i3, + lp1, rp1, ne1, + lp2, rp2, ne2, + lp3, rp3, ne3); + + if (!interior) { + hvx_splat_f32_u(dst_ptr, 0.0f, ne0); + } else { + const uint8_t * src_ptr = pad_src_row_ptr(src, i1, i2, i3, lp1, lp2, lp3); + + if (lp0 > 0) { + hvx_splat_f32_u(dst_ptr, 0.0f, (uint32_t)lp0); + } + + uint8_t * dst_row_start = dst_ptr + (size_t)lp0 * type_size; + if (nb00 == type_size) { + hvx_copy_f32_uu(dst_row_start, src_ptr, ne00); + } else { + for (uint32_t i = 0; i < ne00; i++) { + memcpy(dst_row_start + i * type_size, + src_ptr + (size_t)i * nb00, + type_size); + } + } + + if (rp0 > 0) { + hvx_splat_f32_u(dst_ptr + ((size_t)lp0 + ne00) * type_size, 0.0f, (uint32_t)rp0); + } + } + } + + t2 = HAP_perf_get_qtimer_count(); + + FARF(HIGH, "pad-hvx %d/%d: (%ux%ux%ux%u) -> (%ux%ux%ux%u) rows %u:%u usec %u\n", + ith, nth, + src->ne[0], src->ne[1], src->ne[2], src->ne[3], + dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3], + row_start, row_end, + (unsigned) HAP_perf_qtimer_count_to_us(t2 - t1)); +} + +// --------------------------------------------------------------------------- +// HVX + DMA PAD kernel — aligned, double-buffered +// --------------------------------------------------------------------------- + +static void pad_job_per_thread_hvx_dma(unsigned int nth, unsigned int ith, void * data) { + const struct htp_pad_context * pctx = (const struct htp_pad_context *) data; + struct htp_ops_context * octx = pctx->octx; + htp_pad_preamble; + htp_pad_dma_preamble; + + uint64_t t1, t2; + t1 = HAP_perf_get_qtimer_count(); + + // 
----------------------------------------------------------------------- + // Priming phase: push 2 pairs of (dummy_dst_DMA, src_DMA) to seed the + // double-buffer pipeline before the main loop begins. + // ----------------------------------------------------------------------- + for (uint32_t ir = row_start, spad_idx = 0; ir < row_end && spad_idx < 2; ir++, spad_idx++) { + uint8_t * src_spad_cur = src_spad_base + spad_idx * src_row_size_aligned; + uint8_t * dst_spad_cur = dst_spad_base + spad_idx * dst_row_size_aligned; + + dma_queue_push_vtcm_to_ddr(dma, + dma_make_ptr((uint8_t *)dst->data, dst_spad_cur), + dst_row_size, dst_row_size_aligned, 0); + + uint32_t i1, i2, i3; + pad_decompose_row(ir, ne1, ne2, &i1, &i2, &i3); + const int interior = pad_is_interior(i1, i2, i3, + lp1, rp1, ne1, + lp2, rp2, ne2, + lp3, rp3, ne3); + + const uint8_t * src_ptr = interior + ? pad_src_row_ptr(src, i1, i2, i3, lp1, lp2, lp3) : NULL; + + // Interior row: real DMA (1 row) from DDR to VTCM. + // Border row: null DMA (nrows=0) + dma_queue_push_ddr_to_vtcm(dma, + dma_make_ptr(src_spad_cur, + src_ptr ? src_ptr : (const uint8_t *)src_spad_cur), + src_row_size_aligned, src_row_size, src_ptr ? 1 : 0); + } + + // ----------------------------------------------------------------------- + // Main loop: pop completed DMAs, compute in VTCM with aligned HVX ops, + // push dst DMA and prefetch src for the next+1 row. 
+ // ----------------------------------------------------------------------- + for (uint32_t ir = row_start; ir < row_end; ir++) { + uint8_t * dst_spad_cur = (uint8_t *) dma_queue_pop(dma).src; + uint8_t * src_spad_cur = (uint8_t *) dma_queue_pop(dma).dst; + + uint32_t i1, i2, i3; + pad_decompose_row(ir, ne1, ne2, &i1, &i2, &i3); + + uint8_t * dst_ptr = (uint8_t *) dst->data + i1 * nb1 + i2 * nb2 + i3 * nb3; + + const int interior = pad_is_interior(i1, i2, i3, + lp1, rp1, ne1, + lp2, rp2, ne2, + lp3, rp3, ne3); + + if (!interior) { + hvx_splat_f32_a(dst_spad_cur, 0.0f, ne0); + } else { + hvx_splat_f32_a(dst_spad_cur, 0.0f, ne0); + + uint8_t * dst_interior = dst_spad_cur + (size_t)lp0 * type_size; + + if ((uintptr_t)dst_interior % VLEN == 0) { + hvx_copy_f32_aa(dst_interior, src_spad_cur, ne00); + } else { + hvx_copy_f32_ua(dst_interior, src_spad_cur, ne00); + } + } + + dma_queue_push_vtcm_to_ddr(dma, + dma_make_ptr(dst_ptr, dst_spad_cur), + dst_row_size, dst_row_size_aligned, 1); + + const uint32_t next_row = ir + 2; + if (next_row < row_end) { + uint32_t ni1, ni2, ni3; + pad_decompose_row(next_row, ne1, ne2, &ni1, &ni2, &ni3); + const int next_interior = pad_is_interior(ni1, ni2, ni3, + lp1, rp1, ne1, + lp2, rp2, ne2, + lp3, rp3, ne3); + const uint8_t * next_src_ptr = next_interior + ? pad_src_row_ptr(src, ni1, ni2, ni3, lp1, lp2, lp3) : NULL; + + dma_queue_push_ddr_to_vtcm(dma, + dma_make_ptr(src_spad_cur, + next_src_ptr ? next_src_ptr : (const uint8_t *)src_spad_cur), + src_row_size_aligned, src_row_size, next_src_ptr ? 
1 : 0); + } + } + + dma_queue_flush(dma); + + t2 = HAP_perf_get_qtimer_count(); + + FARF(HIGH, "pad-hvx-dma %d/%d: (%ux%ux%ux%u) -> (%ux%ux%ux%u) rows %u:%u usec %u\n", + ith, nth, + src->ne[0], src->ne[1], src->ne[2], src->ne[3], + dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3], + row_start, row_end, + (unsigned) HAP_perf_qtimer_count_to_us(t2 - t1)); +} + +// --------------------------------------------------------------------------- +// HVX circular PAD kernel +// --------------------------------------------------------------------------- + +static void pad_job_per_thread_hvx_circular(unsigned int nth, unsigned int ith, void * data) { + const struct htp_pad_context * pctx = (const struct htp_pad_context *) data; + struct htp_ops_context * octx = pctx->octx; + htp_pad_preamble; + + uint64_t t1, t2; + t1 = HAP_perf_get_qtimer_count(); + + for (uint32_t dst_row = row_start; dst_row < row_end; dst_row++) { + uint32_t i1, i2, i3; + pad_decompose_row(dst_row, ne1, ne2, &i1, &i2, &i3); + + uint8_t * dst_ptr = (uint8_t *) dst->data + i1 * nb1 + i2 * nb2 + i3 * nb3; + const uint8_t * src_row = pad_circ_src_row_ptr(src, i1, i2, i3, lp1, lp2, lp3); + + if (nb00 == type_size) { + + if (lp0 > 0) { + if ((uint32_t)lp0 < 32) { + memcpy(dst_ptr, + src_row + (size_t)(ne00 - (uint32_t)lp0) * type_size, + (size_t)lp0 * type_size); + } else { + hvx_copy_f32_uu(dst_ptr, + src_row + (size_t)(ne00 - (uint32_t)lp0) * type_size, + (uint32_t)lp0); + } + } + hvx_copy_f32_uu(dst_ptr + (size_t)lp0 * type_size, src_row, ne00); + if (rp0 > 0) { + if ((uint32_t)rp0 < 32) { + memcpy(dst_ptr + ((size_t)lp0 + ne00) * type_size, + src_row, + (size_t)rp0 * type_size); + } else { + hvx_copy_f32_uu(dst_ptr + ((size_t)lp0 + ne00) * type_size, + src_row, + (uint32_t)rp0); + } + } + } else { + for (uint32_t i = 0; i < (uint32_t)lp0; i++) { + *(float *)(dst_ptr + i * type_size) = + *(const float *)(src_row + (size_t)(ne00 - (uint32_t)lp0 + i) * nb00); + } + for (uint32_t i = 0; i < ne00; i++) { + *(float 
*)(dst_ptr + ((size_t)lp0 + i) * type_size) = + *(const float *)(src_row + (size_t)i * nb00); + } + for (uint32_t i = 0; i < (uint32_t)rp0; i++) { + *(float *)(dst_ptr + ((size_t)lp0 + ne00 + i) * type_size) = + *(const float *)(src_row + (size_t)i * nb00); + } + } + } + + t2 = HAP_perf_get_qtimer_count(); + + FARF(HIGH, "pad-hvx-circ %d/%d: (%ux%ux%ux%u) -> (%ux%ux%ux%u) rows %u:%u usec %u\n", + ith, nth, + src->ne[0], src->ne[1], src->ne[2], src->ne[3], + dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3], + row_start, row_end, + (unsigned) HAP_perf_qtimer_count_to_us(t2 - t1)); +} + +// --------------------------------------------------------------------------- +// HVX + DMA circular PAD kernel — aligned, double-buffered +// --------------------------------------------------------------------------- + +static void pad_job_per_thread_hvx_circular_dma(unsigned int nth, unsigned int ith, void * data) { + const struct htp_pad_context * pctx = (const struct htp_pad_context *) data; + struct htp_ops_context * octx = pctx->octx; + htp_pad_preamble; + htp_pad_dma_preamble; + + uint64_t t1, t2; + t1 = HAP_perf_get_qtimer_count(); + + // ----------------------------------------------------------------------- + // Priming phase: push 2 pairs of (dummy_dst_DMA, src_DMA) to seed the + // double-buffer pipeline. Every row is a real src DMA (no null DMAs). 
+ // ----------------------------------------------------------------------- + for (uint32_t ir = row_start, spad_idx = 0; ir < row_end && spad_idx < 2; ir++, spad_idx++) { + uint8_t * src_spad_cur = src_spad_base + spad_idx * src_row_size_aligned; + uint8_t * dst_spad_cur = dst_spad_base + spad_idx * dst_row_size_aligned; + + dma_queue_push_vtcm_to_ddr(dma, + dma_make_ptr((uint8_t *)dst->data, dst_spad_cur), + dst_row_size, dst_row_size_aligned, 0); + + uint32_t pi1, pi2, pi3; + pad_decompose_row(ir, ne1, ne2, &pi1, &pi2, &pi3); + dma_queue_push_ddr_to_vtcm(dma, + dma_make_ptr(src_spad_cur, pad_circ_src_row_ptr(src, pi1, pi2, pi3, lp1, lp2, lp3)), + src_row_size_aligned, src_row_size, 1); + } + + // ----------------------------------------------------------------------- + // Main loop: pop completed DMAs, assemble circular row in VTCM with + // aligned HVX ops, push dst DMA and prefetch src for the next+1 row. + // ----------------------------------------------------------------------- + for (uint32_t ir = row_start; ir < row_end; ir++) { + uint8_t * dst_spad_cur = (uint8_t *) dma_queue_pop(dma).src; + uint8_t * src_spad_cur = (uint8_t *) dma_queue_pop(dma).dst; + + uint32_t i1, i2, i3; + pad_decompose_row(ir, ne1, ne2, &i1, &i2, &i3); + uint8_t * dst_ptr = (uint8_t *) dst->data + i1 * nb1 + i2 * nb2 + i3 * nb3; + + + if (lp0 > 0) { + uint8_t * dst_left = dst_spad_cur; + const uint8_t * src_left = src_spad_cur + (size_t)(ne00 - (uint32_t)lp0) * type_size; + if ((uint32_t)lp0 < 32) { + memcpy(dst_left, src_left, (size_t)lp0 * type_size); + } else { + hvx_copy_f32_uu(dst_left, src_left, (uint32_t)lp0); + } + } + + { + uint8_t * dst_mid = dst_spad_cur + (size_t)lp0 * type_size; + if ((uintptr_t)dst_mid % VLEN == 0) { + hvx_copy_f32_aa(dst_mid, src_spad_cur, ne00); + } else { + hvx_copy_f32_ua(dst_mid, src_spad_cur, ne00); + } + } + + if (rp0 > 0) { + uint8_t * dst_right = dst_spad_cur + ((size_t)lp0 + ne00) * type_size; + if ((uint32_t)rp0 < 32) { + memcpy(dst_right, 
src_spad_cur, (size_t)rp0 * type_size); + } else { + if ((uintptr_t)dst_right % VLEN == 0) { + hvx_copy_f32_aa(dst_right, src_spad_cur, (uint32_t)rp0); + } else { + hvx_copy_f32_ua(dst_right, src_spad_cur, (uint32_t)rp0); + } + } + } + + dma_queue_push_vtcm_to_ddr(dma, + dma_make_ptr(dst_ptr, dst_spad_cur), + dst_row_size, dst_row_size_aligned, 1); + + const uint32_t next_row = ir + 2; + if (next_row < row_end) { + uint32_t nri1, nri2, nri3; + pad_decompose_row(next_row, ne1, ne2, &nri1, &nri2, &nri3); + dma_queue_push_ddr_to_vtcm(dma, + dma_make_ptr(src_spad_cur, + pad_circ_src_row_ptr(src, nri1, nri2, nri3, lp1, lp2, lp3)), + src_row_size_aligned, src_row_size, 1); + } + } + + dma_queue_flush(dma); + + t2 = HAP_perf_get_qtimer_count(); + + FARF(HIGH, "pad-hvx-circ-dma %d/%d: (%ux%ux%ux%u) -> (%ux%ux%ux%u) rows %u:%u usec %u\n", + ith, nth, + src->ne[0], src->ne[1], src->ne[2], src->ne[3], + dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3], + row_start, row_end, + (unsigned) HAP_perf_qtimer_count_to_us(t2 - t1)); +} + +int op_pad(struct htp_ops_context * octx) { + const struct htp_tensor * src0 = octx->src[0]; + const struct htp_tensor * dst = octx->dst; + + // Only F32 supported + size_t type_size; + switch (src0->type) { + case HTP_TYPE_F32: type_size = 4; break; + default: + FARF(ERROR, "pad-hvx: unsupported type %u\n", src0->type); + return HTP_STATUS_NO_SUPPORT; + } + + if (octx->flags & HTP_OPFLAGS_SKIP_COMPUTE) { + return HTP_STATUS_OK; + } + + const int32_t lp0 = octx->op_params[0]; + const int32_t rp0 = octx->op_params[1]; + const int32_t lp1 = octx->op_params[2]; + const int32_t rp1 = octx->op_params[3]; + const int32_t lp2 = octx->op_params[4]; + const int32_t rp2 = octx->op_params[5]; + const int32_t lp3 = octx->op_params[6]; + const int32_t rp3 = octx->op_params[7]; + const int32_t circular = octx->op_params[8]; + + const uint32_t ne0 = dst->ne[0]; + const uint32_t ne00 = src0->ne[0]; + + const uint32_t total_dst_rows = dst->ne[1] * dst->ne[2] * 
dst->ne[3]; + const uint32_t n_threads = MIN(octx->n_threads, total_dst_rows > 0 ? total_dst_rows : 1); + + const size_t src_row_size = (size_t)ne00 * type_size; + const size_t dst_row_size = (size_t)ne0 * type_size; + const size_t src_row_size_aligned = hex_round_up(src_row_size, VLEN); + const size_t dst_row_size_aligned = hex_round_up(dst_row_size, VLEN); + + // Total VTCM needed: 2 buffers (ping+pong) for src and dst, per thread + const size_t vtcm_needed = (size_t)n_threads * 2 * (src_row_size_aligned + dst_row_size_aligned); + + const int use_dma = (src0->nb[0] == (uint32_t)type_size) && + (ne00 >= 512) && + (octx->ctx->vtcm_base != NULL) && + (octx->ctx->vtcm_size >= vtcm_needed); + + if (use_dma) { + octx->src0_spad.size_per_thread = 2 * src_row_size_aligned; + octx->dst_spad.size_per_thread = 2 * dst_row_size_aligned; + octx->src0_spad.size = n_threads * octx->src0_spad.size_per_thread; + octx->dst_spad.size = n_threads * octx->dst_spad.size_per_thread; + octx->src0_spad.data = octx->ctx->vtcm_base; + octx->dst_spad.data = octx->src0_spad.data + octx->src0_spad.size; + } + + struct htp_pad_context pctx = { + .octx = octx, + .lp0 = lp0, .rp0 = rp0, + .lp1 = lp1, .rp1 = rp1, + .lp2 = lp2, .rp2 = rp2, + .lp3 = lp3, .rp3 = rp3, + .nrows_per_thread = (total_dst_rows + n_threads - 1) / n_threads, + .total_dst_rows = total_dst_rows, + .type_size = type_size, + .src_row_size = src_row_size, + .src_row_size_aligned = src_row_size_aligned, + .dst_row_size = dst_row_size, + .dst_row_size_aligned = dst_row_size_aligned, + }; + + FARF(HIGH, "pad-hvx%s%s: (%ux%ux%ux%u) -> (%ux%ux%ux%u) pads=(%d,%d,%d,%d,%d,%d,%d,%d)\n", + circular ? "-circ" : "", + use_dma ? 
"-dma" : "", + src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3], + dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3], + lp0, rp0, lp1, rp1, lp2, rp2, lp3, rp3); + + if (circular && use_dma) { worker_pool_run_func(octx->ctx->worker_pool, pad_job_per_thread_hvx_circular_dma, &pctx, n_threads); } + else if (circular) { worker_pool_run_func(octx->ctx->worker_pool, pad_job_per_thread_hvx_circular, &pctx, n_threads); } + else if (use_dma) { worker_pool_run_func(octx->ctx->worker_pool, pad_job_per_thread_hvx_dma, &pctx, n_threads); } + else { worker_pool_run_func(octx->ctx->worker_pool, pad_job_per_thread_hvx, &pctx, n_threads); } + + return HTP_STATUS_OK; +} + diff --git a/ggml/src/ggml-hexagon/htp/unary-ops.c b/ggml/src/ggml-hexagon/htp/unary-ops.c index d4ae89ee6f04..1ce881353ec9 100644 --- a/ggml/src/ggml-hexagon/htp/unary-ops.c +++ b/ggml/src/ggml-hexagon/htp/unary-ops.c @@ -17,7 +17,6 @@ #include "ggml-common.h" #include "htp-ctx.h" #include "htp-ops.h" -#include "htp-ops.h" struct htp_unary_context { struct htp_ops_context * octx; @@ -277,6 +276,95 @@ static void sigmoid_f32(const float * restrict src, } } +static void tri_f32(const float * restrict src, + float * restrict dst, + uint8_t * restrict spad, + const uint32_t num_rows, + const uint32_t row_elems, + const size_t row_size, + int32_t * op_params, + const uint32_t ir, + const struct htp_unary_context * uctx) { + + const int32_t ttype = op_params[0]; + const HVX_Vector zero = hvx_vec_splat_f32(0.0f); + const uint32_t nvec = row_elems / VLEN_FP32; + const uint32_t nloe = row_elems % VLEN_FP32; + + const uint32_t ne01 = uctx->octx->src[0]->ne[1]; + + for (uint32_t b = 0; b < num_rows; b++) { + const uint32_t abs_row = ir + b; + const uint32_t i01 = abs_row % ne01; + + const HVX_Vector * restrict v_src = (const HVX_Vector *) ((const uint8_t *) src + b * row_size); + HVX_Vector * restrict v_dst = (HVX_Vector *) ((uint8_t *) dst + b * row_size); + + uint32_t boundary; + int keep_left; + switch (ttype) { + case 0: 
boundary = i01; keep_left = 0; break; // keep col >= row + case 1: boundary = i01 + 1; keep_left = 0; break; // keep col > row + case 2: boundary = i01 + 1; keep_left = 1; break; // keep col <= row + case 3: boundary = i01; keep_left = 1; break; // keep col < row + default: boundary = 0; keep_left = 0; break; + } + if (boundary > row_elems) boundary = row_elems; + + // Full HVX vectors — each starts at a 128-byte aligned offset + for (uint32_t i = 0; i < nvec; i++) { + const uint32_t vec_start = i * VLEN_FP32; + const uint32_t vec_end = vec_start + VLEN_FP32; + if (keep_left) { + if (vec_end <= boundary) { + v_dst[i] = v_src[i]; + } else if (vec_start >= boundary) { + v_dst[i] = zero; + } else { + HVX_VectorPred mask = Q6_Q_vsetq_R((boundary - vec_start) * sizeof(float)); + v_dst[i] = Q6_V_vmux_QVV(mask, v_src[i], zero); + } + } else { + if (vec_end <= boundary) { + v_dst[i] = zero; + } else if (vec_start >= boundary) { + v_dst[i] = v_src[i]; + } else { + HVX_VectorPred mask = Q6_Q_vsetq_R((boundary - vec_start) * sizeof(float)); + v_dst[i] = Q6_V_vmux_QVV(mask, zero, v_src[i]); + } + } + } + + // Tail elements (row_elems not a multiple of VLEN_FP32) + if (nloe > 0) { + const uint32_t vec_start = nvec * VLEN_FP32; + const uint32_t vec_end = vec_start + nloe; + HVX_Vector tail_val; + if (keep_left) { + if (vec_end <= boundary) { + tail_val = v_src[nvec]; + } else if (vec_start >= boundary) { + tail_val = zero; + } else { + HVX_VectorPred mask = Q6_Q_vsetq_R((boundary - vec_start) * sizeof(float)); + tail_val = Q6_V_vmux_QVV(mask, v_src[nvec], zero); + } + } else { + if (vec_end <= boundary) { + tail_val = zero; + } else if (vec_start >= boundary) { + tail_val = v_src[nvec]; + } else { + HVX_VectorPred mask = Q6_Q_vsetq_R((boundary - vec_start) * sizeof(float)); + tail_val = Q6_V_vmux_QVV(mask, zero, v_src[nvec]); + } + } + hvx_vec_store_a(&v_dst[nvec], nloe * sizeof(float), tail_val); + } + } +} + static void softplus_f32(const float * restrict src, float * restrict 
dst, uint8_t * restrict spad, @@ -498,6 +586,9 @@ static void unary_job_f32_per_thread(unsigned int nth, unsigned int ith, void * case HTP_OP_L2_NORM: l2_norm_f32(src0_spad, dst_spad, NULL, block_size, ne0, src0_row_size_aligned, op_params); break; + case HTP_OP_TRI: + tri_f32(src0_spad, dst_spad, NULL, block_size, ne00, src0_row_size_aligned, op_params, ir, uctx); + break; default: break; } @@ -571,6 +662,10 @@ static int execute_op_unary_f32(struct htp_ops_context * octx) { case HTP_OP_L2_NORM: op_type = "l2norm-f32"; break; + case HTP_OP_TRI: + op_type = "tri-f32"; + break; + default: FARF(ERROR, "Unsupported unary Op %u\n", octx->op); return HTP_STATUS_NO_SUPPORT; @@ -640,6 +735,22 @@ static int execute_op_unary_f32(struct htp_ops_context * octx) { return err; } +int op_tri(struct htp_ops_context * octx) { + int err = HTP_STATUS_OK; + + switch (octx->src[0]->type) { + case HTP_TYPE_F32: + err = execute_op_unary_f32(octx); + break; + + default: + err = HTP_STATUS_NO_SUPPORT; + break; + } + + return err; +} + int op_unary(struct htp_ops_context * octx) { int err = HTP_STATUS_OK;