Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .devops/cann.Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,7 @@ RUN mkdir -p /app/lib && \
RUN mkdir -p /app/full && \
cp build/bin/* /app/full/ && \
cp *.py /app/full/ && \
cp -r conversion /app/full/ && \
cp -r gguf-py /app/full/ && \
cp -r requirements /app/full/ && \
cp requirements.txt /app/full/
Expand Down
1 change: 1 addition & 0 deletions .devops/cpu.Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@ RUN mkdir -p /app/lib && \
RUN mkdir -p /app/full \
&& cp build/bin/* /app/full \
&& cp *.py /app/full \
&& cp -r conversion /app/full \
&& cp -r gguf-py /app/full \
&& cp -r requirements /app/full \
&& cp requirements.txt /app/full \
Expand Down
1 change: 1 addition & 0 deletions .devops/cuda.Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,7 @@ RUN mkdir -p /app/lib && \
RUN mkdir -p /app/full \
&& cp build/bin/* /app/full \
&& cp *.py /app/full \
&& cp -r conversion /app/full \
&& cp -r gguf-py /app/full \
&& cp -r requirements /app/full \
&& cp requirements.txt /app/full \
Expand Down
1 change: 1 addition & 0 deletions .devops/intel.Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,7 @@ RUN mkdir -p /app/lib && \
RUN mkdir -p /app/full \
&& cp build/bin/* /app/full \
&& cp *.py /app/full \
&& cp -r conversion /app/full \
&& cp -r gguf-py /app/full \
&& cp -r requirements /app/full \
&& cp requirements.txt /app/full \
Expand Down
1 change: 1 addition & 0 deletions .devops/musa.Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,7 @@ RUN mkdir -p /app/lib && \
RUN mkdir -p /app/full \
&& cp build/bin/* /app/full \
&& cp *.py /app/full \
&& cp -r conversion /app/full \
&& cp -r gguf-py /app/full \
&& cp -r requirements /app/full \
&& cp requirements.txt /app/full \
Expand Down
1 change: 1 addition & 0 deletions .devops/openvino.Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -81,6 +81,7 @@ RUN mkdir -p /app/lib && \
RUN mkdir -p /app/full \
&& cp build/ReleaseOV/bin/* /app/full/ \
&& cp *.py /app/full \
&& cp -r conversion /app/full \
&& cp -r gguf-py /app/full \
&& cp -r requirements /app/full \
&& cp requirements.txt /app/full \
Expand Down
1 change: 1 addition & 0 deletions .devops/rocm.Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,7 @@ RUN mkdir -p /app/lib \
RUN mkdir -p /app/full \
&& cp build/bin/* /app/full \
&& cp *.py /app/full \
&& cp -r conversion /app/full \
&& cp -r gguf-py /app/full \
&& cp -r requirements /app/full \
&& cp requirements.txt /app/full \
Expand Down
9 changes: 6 additions & 3 deletions .devops/s390x.Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,7 @@ RUN --mount=type=cache,target=/root/.ccache \

COPY *.py /opt/llama.cpp/bin
COPY .devops/tools.sh /opt/llama.cpp/bin
COPY conversion /opt/llama.cpp/conversion

COPY gguf-py /opt/llama.cpp/gguf-py
COPY requirements.txt /opt/llama.cpp/gguf-py
Expand All @@ -47,9 +48,10 @@ COPY requirements /opt/llama.cpp/gguf-py/requirements
FROM scratch AS collector

# Copy llama.cpp binaries and libraries
COPY --from=build /opt/llama.cpp/bin /llama.cpp/bin
COPY --from=build /opt/llama.cpp/lib /llama.cpp/lib
COPY --from=build /opt/llama.cpp/gguf-py /llama.cpp/gguf-py
COPY --from=build /opt/llama.cpp/bin /llama.cpp/bin
COPY --from=build /opt/llama.cpp/lib /llama.cpp/lib
COPY --from=build /opt/llama.cpp/gguf-py /llama.cpp/gguf-py
COPY --from=build /opt/llama.cpp/conversion /llama.cpp/conversion


### Base image
Expand Down Expand Up @@ -107,6 +109,7 @@ RUN curl https://sh.rustup.rs -sSf | bash -s -- -y

COPY --from=collector /llama.cpp/bin /app
COPY --from=collector /llama.cpp/gguf-py /app/gguf-py
COPY --from=collector /llama.cpp/conversion /app/conversion

RUN pip install --no-cache-dir --break-system-packages \
-r /app/gguf-py/requirements.txt
Expand Down
1 change: 1 addition & 0 deletions .devops/vulkan.Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@ RUN mkdir -p /app/lib && \
RUN mkdir -p /app/full \
&& cp build/bin/* /app/full \
&& cp *.py /app/full \
&& cp -r conversion /app/full \
&& cp -r gguf-py /app/full \
&& cp -r requirements /app/full \
&& cp requirements.txt /app/full \
Expand Down
4 changes: 2 additions & 2 deletions .github/workflows/build-and-test-snapdragon.yml
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@ jobs:
android-ndk-snapdragon:
runs-on: ubuntu-latest
container:
image: 'ghcr.io/snapdragon-toolchain/arm64-android:v0.3'
image: 'ghcr.io/snapdragon-toolchain/arm64-android:v0.6'
defaults:
run:
shell: bash
Expand Down Expand Up @@ -61,7 +61,7 @@ jobs:
linux-iot-snapdragon:
runs-on: ubuntu-latest
container:
image: 'ghcr.io/snapdragon-toolchain/arm64-linux:v0.1'
image: 'ghcr.io/snapdragon-toolchain/arm64-linux:v0.6'
defaults:
run:
shell: bash
Expand Down
8 changes: 4 additions & 4 deletions docs/backend/snapdragon/CMakeUserPresets.json
Original file line number Diff line number Diff line change
Expand Up @@ -10,8 +10,8 @@
"ANDROID_ABI": "arm64-v8a",
"ANDROID_PLATFORM": "android-31",
"CMAKE_TOOLCHAIN_FILE": "$env{ANDROID_NDK_ROOT}/build/cmake/android.toolchain.cmake",
"CMAKE_C_FLAGS": "-march=armv8.7a+fp16 -fvectorize -ffp-model=fast -fno-finite-math-only -flto -D_GNU_SOURCE",
"CMAKE_CXX_FLAGS": "-march=armv8.7a+fp16 -fvectorize -ffp-model=fast -fno-finite-math-only -flto -D_GNU_SOURCE",
"CMAKE_C_FLAGS": "-march=armv8.7a+fp16+dotprod+i8mm -fvectorize -ffp-model=fast -fno-finite-math-only -flto -D_GNU_SOURCE",
"CMAKE_CXX_FLAGS": "-march=armv8.7a+fp16+dotprod+i8mm -fvectorize -ffp-model=fast -fno-finite-math-only -flto -D_GNU_SOURCE",
"CMAKE_C_FLAGS_RELEASE": "-O3 -DNDEBUG",
"CMAKE_CXX_FLAGS_RELEASE": "-O3 -DNDEBUG",
"CMAKE_C_FLAGS_RELWITHDEBINFO": "-O3 -DNDEBUG -g",
Expand Down Expand Up @@ -59,8 +59,8 @@
"toolset": { "value": "host=x86_64", "strategy": "external" },
"cacheVariables": {
"CMAKE_TOOLCHAIN_FILE": "cmake/arm64-linux-clang.cmake",
"CMAKE_C_FLAGS": "-march=armv8 -fno-finite-math-only -flto -D_GNU_SOURCE",
"CMAKE_CXX_FLAGS": "-march=armv8 -fno-finite-math-only -flto -D_GNU_SOURCE",
"CMAKE_C_FLAGS": "-march=armv8.2a+fp16+dotprod -fvectorize -fno-finite-math-only -flto -D_GNU_SOURCE",
"CMAKE_CXX_FLAGS": "-march=armv8.2a+fp16+dotprod -fvectorize -fno-finite-math-only -flto -D_GNU_SOURCE",
"CMAKE_C_FLAGS_RELEASE": "-O3 -DNDEBUG",
"CMAKE_CXX_FLAGS_RELEASE": "-O3 -DNDEBUG",
"CMAKE_C_FLAGS_RELWITHDEBINFO": "-O3 -DNDEBUG -g",
Expand Down
2 changes: 1 addition & 1 deletion docs/backend/snapdragon/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ This image includes Android NDK, OpenCL SDK, Hexagon SDK, CMake, etc.
This method works on Linux, macOS, and Windows. macOS and Windows users should install Docker Desktop.

```
~/src/llama.cpp$ docker run -it -u $(id -u):$(id -g) --volume $(pwd):/workspace --platform linux/amd64 ghcr.io/snapdragon-toolchain/arm64-android:v0.3
~/src/llama.cpp$ docker run -it -u $(id -u):$(id -g) --volume $(pwd):/workspace --platform linux/amd64 ghcr.io/snapdragon-toolchain/arm64-android:v0.6
[d]/> cd /workspace
```

Expand Down
8 changes: 7 additions & 1 deletion ggml/src/ggml-metal/ggml-metal-device.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1897,7 +1897,11 @@ ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_pad(ggml_metal_l
char base[256];
char name[256];

snprintf(base, 256, "kernel_pad_%s", ggml_type_name(op->src[0]->type));
// note: this is slower
//const bool is_c4 = op->src[0]->ne[0] % 4 == 0 && op->ne[0] % 4 == 0;
const bool is_c4 = false;

snprintf(base, 256, "kernel_pad_%s%s", ggml_type_name(op->src[0]->type), is_c4 ? "_4" : "");
snprintf(name, 256, "%s", base);

ggml_metal_pipeline_with_params res = ggml_metal_library_get_pipeline(lib, name);
Expand All @@ -1907,6 +1911,8 @@ ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_pad(ggml_metal_l

res = ggml_metal_library_compile_pipeline(lib, base, name, nullptr);

res.c4 = is_c4;

return res;
}

Expand Down
17 changes: 11 additions & 6 deletions ggml/src/ggml-metal/ggml-metal-ops.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -816,9 +816,7 @@ int ggml_metal_op_unary(ggml_metal_op_t ctx, int idx) {
ggml_metal_encoder_dispatch_threadgroups(enc, n, 1, 1, 1, 1, 1);
} else {
const int nth_max = MIN(256, ggml_metal_pipeline_max_theads_per_threadgroup(pipeline));

const int nth = MIN(args.ne00, nth_max);

const int nk0 = (args.ne00 + nth - 1)/nth;

ggml_metal_encoder_dispatch_threadgroups(enc, nk0*ne01, ne02, ne03, nth, 1, 1);
Expand Down Expand Up @@ -1863,7 +1861,7 @@ int ggml_metal_op_cpy(ggml_metal_op_t ctx, int idx) {
nk0 = ne00/ggml_blck_size(op->type);
}

int nth = std::min<int>(nk0, ggml_metal_pipeline_max_theads_per_threadgroup(pipeline));
int nth = std::min<int>(nk0*ne01, 256);

// when rows are small, we can batch them together in a single threadgroup
int nrptg = 1;
Expand All @@ -1874,7 +1872,7 @@ int ggml_metal_op_cpy(ggml_metal_op_t ctx, int idx) {
nrptg = (nth + nk0 - 1)/nk0;
nth = nk0;

if (nrptg*nth > ggml_metal_pipeline_max_theads_per_threadgroup(pipeline)) {
if (nrptg*nth > 256) {
nrptg--;
}
}
Expand Down Expand Up @@ -4039,14 +4037,21 @@ int ggml_metal_op_pad(ggml_metal_op_t ctx, int idx) {

auto pipeline = ggml_metal_library_get_pipeline_pad(lib, op);

const int nth = std::min(1024, ne0);
if (pipeline.c4) {
args.ne00 = ne00/4;
args.ne0 = ne0/4;
}

const int nth_max = MIN(64, ggml_metal_pipeline_max_theads_per_threadgroup(pipeline));
const int nth = MIN(args.ne0, nth_max);
const int nk0 = (args.ne0 + 1024 - 1)/1024; // note: 1024 is hardcoded in the kernel!

ggml_metal_encoder_set_pipeline(enc, pipeline);
ggml_metal_encoder_set_bytes (enc, &args, sizeof(args), 0);
ggml_metal_encoder_set_buffer (enc, ggml_metal_get_buffer_id(op->src[0]), 1);
ggml_metal_encoder_set_buffer (enc, ggml_metal_get_buffer_id(op), 2);

ggml_metal_encoder_dispatch_threadgroups(enc, ne1, ne2, ne3, nth, 1, 1);
ggml_metal_encoder_dispatch_threadgroups(enc, nk0*ne1, ne2, ne3, nth, 1, 1);

return 1;
}
Expand Down
Loading
Loading