diff --git a/.devops/cann.Dockerfile b/.devops/cann.Dockerfile index acd1e26bcec..9df86d0489b 100644 --- a/.devops/cann.Dockerfile +++ b/.devops/cann.Dockerfile @@ -58,6 +58,7 @@ RUN mkdir -p /app/lib && \ RUN mkdir -p /app/full && \ cp build/bin/* /app/full/ && \ cp *.py /app/full/ && \ + cp -r conversion /app/full/ && \ cp -r gguf-py /app/full/ && \ cp -r requirements /app/full/ && \ cp requirements.txt /app/full/ diff --git a/.devops/cpu.Dockerfile b/.devops/cpu.Dockerfile index c8f32235d16..c19b7038bbe 100644 --- a/.devops/cpu.Dockerfile +++ b/.devops/cpu.Dockerfile @@ -30,6 +30,7 @@ RUN mkdir -p /app/lib && \ RUN mkdir -p /app/full \ && cp build/bin/* /app/full \ && cp *.py /app/full \ + && cp -r conversion /app/full \ && cp -r gguf-py /app/full \ && cp -r requirements /app/full \ && cp requirements.txt /app/full \ diff --git a/.devops/cuda.Dockerfile b/.devops/cuda.Dockerfile index 3805ea3a009..621fe8b6a97 100644 --- a/.devops/cuda.Dockerfile +++ b/.devops/cuda.Dockerfile @@ -36,6 +36,7 @@ RUN mkdir -p /app/lib && \ RUN mkdir -p /app/full \ && cp build/bin/* /app/full \ && cp *.py /app/full \ + && cp -r conversion /app/full \ && cp -r gguf-py /app/full \ && cp -r requirements /app/full \ && cp requirements.txt /app/full \ diff --git a/.devops/intel.Dockerfile b/.devops/intel.Dockerfile index 218418b80b6..b127c5cec46 100644 --- a/.devops/intel.Dockerfile +++ b/.devops/intel.Dockerfile @@ -36,6 +36,7 @@ RUN mkdir -p /app/lib && \ RUN mkdir -p /app/full \ && cp build/bin/* /app/full \ && cp *.py /app/full \ + && cp -r conversion /app/full \ && cp -r gguf-py /app/full \ && cp -r requirements /app/full \ && cp requirements.txt /app/full \ diff --git a/.devops/musa.Dockerfile b/.devops/musa.Dockerfile index a7f70b5f0df..3194294b36d 100644 --- a/.devops/musa.Dockerfile +++ b/.devops/musa.Dockerfile @@ -41,6 +41,7 @@ RUN mkdir -p /app/lib && \ RUN mkdir -p /app/full \ && cp build/bin/* /app/full \ && cp *.py /app/full \ + && cp -r conversion /app/full \ && cp -r gguf-py /app/full \ && cp -r requirements /app/full \ && cp requirements.txt /app/full \ diff --git a/.devops/openvino.Dockerfile b/.devops/openvino.Dockerfile index 1266713f378..6dabdb323ca 100644 --- a/.devops/openvino.Dockerfile +++ b/.devops/openvino.Dockerfile @@ -81,6 +81,7 @@ RUN mkdir -p /app/lib && \ RUN mkdir -p /app/full \ && cp build/ReleaseOV/bin/* /app/full/ \ && cp *.py /app/full \ + && cp -r conversion /app/full \ && cp -r gguf-py /app/full \ && cp -r requirements /app/full \ && cp requirements.txt /app/full \ diff --git a/.devops/rocm.Dockerfile b/.devops/rocm.Dockerfile index 2da15975d13..3fdf7a8e487 100644 --- a/.devops/rocm.Dockerfile +++ b/.devops/rocm.Dockerfile @@ -53,6 +53,7 @@ RUN mkdir -p /app/lib \ RUN mkdir -p /app/full \ && cp build/bin/* /app/full \ && cp *.py /app/full \ + && cp -r conversion /app/full \ && cp -r gguf-py /app/full \ && cp -r requirements /app/full \ && cp requirements.txt /app/full \ diff --git a/.devops/s390x.Dockerfile b/.devops/s390x.Dockerfile index d36f5f3ccc5..31c2fa902d4 100644 --- a/.devops/s390x.Dockerfile +++ b/.devops/s390x.Dockerfile @@ -37,6 +37,7 @@ RUN --mount=type=cache,target=/root/.ccache \ COPY *.py /opt/llama.cpp/bin COPY .devops/tools.sh /opt/llama.cpp/bin +COPY conversion /opt/llama.cpp/conversion COPY gguf-py /opt/llama.cpp/gguf-py COPY requirements.txt /opt/llama.cpp/gguf-py @@ -47,9 +48,10 @@ COPY requirements /opt/llama.cpp/gguf-py/requirements FROM scratch AS collector # Copy llama.cpp binaries and libraries -COPY --from=build /opt/llama.cpp/bin /llama.cpp/bin -COPY --from=build /opt/llama.cpp/lib /llama.cpp/lib -COPY --from=build /opt/llama.cpp/gguf-py /llama.cpp/gguf-py +COPY --from=build /opt/llama.cpp/bin /llama.cpp/bin +COPY --from=build /opt/llama.cpp/lib /llama.cpp/lib +COPY --from=build /opt/llama.cpp/gguf-py /llama.cpp/gguf-py +COPY --from=build /opt/llama.cpp/conversion /llama.cpp/conversion ### Base image @@ -107,6 +109,7 @@ RUN curl https://sh.rustup.rs -sSf | bash -s -- -y COPY --from=collector /llama.cpp/bin /app COPY --from=collector /llama.cpp/gguf-py /app/gguf-py +COPY --from=collector /llama.cpp/conversion /app/conversion RUN pip install --no-cache-dir --break-system-packages \ -r /app/gguf-py/requirements.txt diff --git a/.devops/vulkan.Dockerfile b/.devops/vulkan.Dockerfile index 464ccfef1ce..138a50d7da5 100644 --- a/.devops/vulkan.Dockerfile +++ b/.devops/vulkan.Dockerfile @@ -26,6 +26,7 @@ RUN mkdir -p /app/lib && \ RUN mkdir -p /app/full \ && cp build/bin/* /app/full \ && cp *.py /app/full \ + && cp -r conversion /app/full \ && cp -r gguf-py /app/full \ && cp -r requirements /app/full \ && cp requirements.txt /app/full \ diff --git a/.github/workflows/build-and-test-snapdragon.yml b/.github/workflows/build-and-test-snapdragon.yml index ef3fe502fa7..84613b4c830 100644 --- a/.github/workflows/build-and-test-snapdragon.yml +++ b/.github/workflows/build-and-test-snapdragon.yml @@ -31,7 +31,7 @@ jobs: android-ndk-snapdragon: runs-on: ubuntu-latest container: - image: 'ghcr.io/snapdragon-toolchain/arm64-android:v0.3' + image: 'ghcr.io/snapdragon-toolchain/arm64-android:v0.6' defaults: run: shell: bash @@ -61,7 +61,7 @@ jobs: linux-iot-snapdragon: runs-on: ubuntu-latest container: - image: 'ghcr.io/snapdragon-toolchain/arm64-linux:v0.1' + image: 'ghcr.io/snapdragon-toolchain/arm64-linux:v0.6' defaults: run: shell: bash diff --git a/docs/backend/snapdragon/CMakeUserPresets.json b/docs/backend/snapdragon/CMakeUserPresets.json index c07bf5ca0c6..d2629fc4de9 100644 --- a/docs/backend/snapdragon/CMakeUserPresets.json +++ b/docs/backend/snapdragon/CMakeUserPresets.json @@ -10,8 +10,8 @@ "ANDROID_ABI": "arm64-v8a", "ANDROID_PLATFORM": "android-31", "CMAKE_TOOLCHAIN_FILE": "$env{ANDROID_NDK_ROOT}/build/cmake/android.toolchain.cmake", - "CMAKE_C_FLAGS": "-march=armv8.7a+fp16 -fvectorize -ffp-model=fast -fno-finite-math-only -flto -D_GNU_SOURCE", - "CMAKE_CXX_FLAGS": "-march=armv8.7a+fp16 -fvectorize -ffp-model=fast -fno-finite-math-only -flto -D_GNU_SOURCE", + "CMAKE_C_FLAGS": "-march=armv8.7a+fp16+dotprod+i8mm -fvectorize -ffp-model=fast -fno-finite-math-only -flto -D_GNU_SOURCE", + "CMAKE_CXX_FLAGS": "-march=armv8.7a+fp16+dotprod+i8mm -fvectorize -ffp-model=fast -fno-finite-math-only -flto -D_GNU_SOURCE", "CMAKE_C_FLAGS_RELEASE": "-O3 -DNDEBUG", "CMAKE_CXX_FLAGS_RELEASE": "-O3 -DNDEBUG", "CMAKE_C_FLAGS_RELWITHDEBINFO": "-O3 -DNDEBUG -g", @@ -59,8 +59,8 @@ "toolset": { "value": "host=x86_64", "strategy": "external" }, "cacheVariables": { "CMAKE_TOOLCHAIN_FILE": "cmake/arm64-linux-clang.cmake", - "CMAKE_C_FLAGS": "-march=armv8 -fno-finite-math-only -flto -D_GNU_SOURCE", - "CMAKE_CXX_FLAGS": "-march=armv8 -fno-finite-math-only -flto -D_GNU_SOURCE", + "CMAKE_C_FLAGS": "-march=armv8.2a+fp16+dotprod -fvectorize -fno-finite-math-only -flto -D_GNU_SOURCE", + "CMAKE_CXX_FLAGS": "-march=armv8.2a+fp16+dotprod -fvectorize -fno-finite-math-only -flto -D_GNU_SOURCE", "CMAKE_C_FLAGS_RELEASE": "-O3 -DNDEBUG", "CMAKE_CXX_FLAGS_RELEASE": "-O3 -DNDEBUG", "CMAKE_C_FLAGS_RELWITHDEBINFO": "-O3 -DNDEBUG -g", diff --git a/docs/backend/snapdragon/README.md b/docs/backend/snapdragon/README.md index 2414eeaf6a4..f5bb3d11c48 100644 --- a/docs/backend/snapdragon/README.md +++ b/docs/backend/snapdragon/README.md @@ -10,7 +10,7 @@ This image includes Android NDK, OpenCL SDK, Hexagon SDK, CMake, etc. This method works on Linux, macOS, and Windows. macOS and Windows users should install Docker Desktop. ``` -~/src/llama.cpp$ docker run -it -u $(id -u):$(id -g) --volume $(pwd):/workspace --platform linux/amd64 ghcr.io/snapdragon-toolchain/arm64-android:v0.3 +~/src/llama.cpp$ docker run -it -u $(id -u):$(id -g) --volume $(pwd):/workspace --platform linux/amd64 ghcr.io/snapdragon-toolchain/arm64-android:v0.6 [d]/> cd /workspace ``` diff --git a/ggml/src/ggml-metal/ggml-metal-device.cpp b/ggml/src/ggml-metal/ggml-metal-device.cpp index e288a27f992..ba006d9b31a 100644 --- a/ggml/src/ggml-metal/ggml-metal-device.cpp +++ b/ggml/src/ggml-metal/ggml-metal-device.cpp @@ -1897,7 +1897,11 @@ ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_pad(ggml_metal_l char base[256]; char name[256]; - snprintf(base, 256, "kernel_pad_%s", ggml_type_name(op->src[0]->type)); + // note: this is slower + //const bool is_c4 = op->src[0]->ne[0] % 4 == 0 && op->ne[0] % 4 == 0; + const bool is_c4 = false; + + snprintf(base, 256, "kernel_pad_%s%s", ggml_type_name(op->src[0]->type), is_c4 ? "_4" : ""); snprintf(name, 256, "%s", base); ggml_metal_pipeline_with_params res = ggml_metal_library_get_pipeline(lib, name); @@ -1907,6 +1911,8 @@ ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_pad(ggml_metal_l res = ggml_metal_library_compile_pipeline(lib, base, name, nullptr); + res.c4 = is_c4; + return res; } diff --git a/ggml/src/ggml-metal/ggml-metal-ops.cpp b/ggml/src/ggml-metal/ggml-metal-ops.cpp index a114391c2e8..8506000b6c0 100644 --- a/ggml/src/ggml-metal/ggml-metal-ops.cpp +++ b/ggml/src/ggml-metal/ggml-metal-ops.cpp @@ -816,9 +816,7 @@ int ggml_metal_op_unary(ggml_metal_op_t ctx, int idx) { ggml_metal_encoder_dispatch_threadgroups(enc, n, 1, 1, 1, 1, 1); } else { const int nth_max = MIN(256, ggml_metal_pipeline_max_theads_per_threadgroup(pipeline)); - const int nth = MIN(args.ne00, nth_max); - const int nk0 = (args.ne00 + nth - 1)/nth; ggml_metal_encoder_dispatch_threadgroups(enc, nk0*ne01, ne02, ne03, nth, 1, 1); @@ -1863,7 +1861,7 @@ int ggml_metal_op_cpy(ggml_metal_op_t ctx, int idx) { nk0 = ne00/ggml_blck_size(op->type); } - int nth = std::min(nk0, ggml_metal_pipeline_max_theads_per_threadgroup(pipeline)); + int nth = std::min(nk0*ne01, 256); // when rows are small, we can batch them together in a single threadgroup int nrptg = 1; @@ -1874,7 +1872,7 @@ int ggml_metal_op_cpy(ggml_metal_op_t ctx, int idx) { nrptg = (nth + nk0 - 1)/nk0; nth = nk0; - if (nrptg*nth > ggml_metal_pipeline_max_theads_per_threadgroup(pipeline)) { + if (nrptg*nth > 256) { nrptg--; } } @@ -4039,14 +4037,21 @@ int ggml_metal_op_pad(ggml_metal_op_t ctx, int idx) { auto pipeline = ggml_metal_library_get_pipeline_pad(lib, op); - const int nth = std::min(1024, ne0); + if (pipeline.c4) { + args.ne00 = ne00/4; + args.ne0 = ne0/4; + } + + const int nth_max = MIN(64, ggml_metal_pipeline_max_theads_per_threadgroup(pipeline)); + const int nth = MIN(args.ne0, nth_max); + const int nk0 = (args.ne0 + 1024 - 1)/1024; // note: 1024 is hardcoded in the kernel! ggml_metal_encoder_set_pipeline(enc, pipeline); ggml_metal_encoder_set_bytes (enc, &args, sizeof(args), 0); ggml_metal_encoder_set_buffer (enc, ggml_metal_get_buffer_id(op->src[0]), 1); ggml_metal_encoder_set_buffer (enc, ggml_metal_get_buffer_id(op), 2); - ggml_metal_encoder_dispatch_threadgroups(enc, ne1, ne2, ne3, nth, 1, 1); + ggml_metal_encoder_dispatch_threadgroups(enc, nk0*ne1, ne2, ne3, nth, 1, 1); return 1; } diff --git a/ggml/src/ggml-metal/ggml-metal.metal b/ggml/src/ggml-metal/ggml-metal.metal index f6ffb2b3a1c..4cf9dbea946 100644 --- a/ggml/src/ggml-metal/ggml-metal.metal +++ b/ggml/src/ggml-metal/ggml-metal.metal @@ -2643,7 +2643,7 @@ kernel void kernel_gated_delta_net_impl( b_ptr += args.ne21; g_ptr += args.ne21*G; - if (K > 1u) { + if (K > 1) { const int target_slot = (int)t - shift; if (target_slot >= 0 && target_slot < (int)K) { device float * dst_state = (device float *) (dst) + attn_size + (uint)target_slot * state_size_per_snap + state_out_base; @@ -2655,7 +2655,7 @@ kernel void kernel_gated_delta_net_impl( } } - if (K == 1u) { + if (K == 1) { device float * dst_state = (device float *) (dst) + attn_size + state_out_base; FOR_UNROLL (short j = 0; j < NSG; j++) { const short is = tx*NSG + j; @@ -5104,7 +5104,7 @@ kernel void kernel_upscale_bilinear_f32( for (int64_t sx = x_min; sx < x_max; ++sx) { const float wx = MAX(0.0f, 1.0f - fabs((float)sx - f00) * invscale0); const float w = wx * wy; - const device const float * src_ptr = (device const float *)(src0 + sy*args.nb01 + sx*args.nb00); + device const float * src_ptr = (device const float *)(src0 + sy*args.nb01 + sx*args.nb00); sum += (*src_ptr) * w; wsum += w; } @@ -5286,7 +5286,7 @@ kernel void kernel_upscale_bicubic_f32( const int64_t ix = MAX(0, MIN(args.ne00 - 1, i00 + dx)); const float wx = (dx == -1) ? w_x0 : (dx == 0) ? w_x1 : (dx == 1) ? w_x2 : w_x3; - const device const float * src_ptr = (device const float *)(src_slice + iy * args.nb01 + ix * args.nb00); + device const float * src_ptr = (device const float *)(src_slice + iy * args.nb01 + ix * args.nb00); sum += (*src_ptr) * wx * wy; } } @@ -5329,42 +5329,46 @@ kernel void kernel_roll_f32( } } -kernel void kernel_pad_f32( +template +kernel void kernel_pad_impl( constant ggml_metal_kargs_pad & args, device const char * src0, device char * dst, uint3 tgpig[[threadgroup_position_in_grid]], uint3 tpitg[[thread_position_in_threadgroup]], uint3 ntg[[threads_per_threadgroup]]) { + const int32_t i3 = tgpig.z; + const int32_t i2 = tgpig.y; + const int32_t k0 = tgpig.x/args.ne1; + const int32_t i1 = tgpig.x - k0*args.ne1; - const int64_t i3 = tgpig.z; - const int64_t i2 = tgpig.y; - const int64_t i1 = tgpig.x; + const int32_t i03 = i3; + const int32_t i02 = i2; + const int32_t i01 = i1; - const int64_t i03 = i3; - const int64_t i02 = i2; - const int64_t i01 = i1; - - device const float * src0_ptr = (device const float *) (src0 + i03*args.nb03 + i02*args.nb02 + i01*args.nb01); - device float * dst_ptr = (device float *) (dst + i3*args.nb3 + i2*args.nb2 + i1*args.nb1); + device const T * src0_ptr = (device const T *) (src0 + i03*args.nb03 + i02*args.nb02 + i01*args.nb01); + device T * dst_ptr = (device T *) (dst + i3*args.nb3 + i2*args.nb2 + i1*args.nb1); - if (i1 < args.ne01 && i2 < args.ne02 && i3 < args.ne03) { - for (int i0 = tpitg.x; i0 < args.ne0; i0 += ntg.x) { - if (i0 < args.ne00) { - dst_ptr[i0] = src0_ptr[i0]; - } else { - dst_ptr[i0] = 0.0f; - } + for (int32_t l0 = 0; l0 < 1024; l0 += ntg.x) { + const int32_t i0 = k0*1024 + tpitg.x + l0; + if (i0 >= args.ne0) { + break; } - return; - } - - for (int i0 = tpitg.x; i0 < args.ne0; i0 += ntg.x) { - dst_ptr[i0] = 0.0f; + if (i0 < args.ne00 && i1 < args.ne01 && i2 < args.ne02 && i3 < args.ne03) { + dst_ptr[i0] = src0_ptr[i0]; + } else { + dst_ptr[i0] = 0.0f; + } } } +typedef decltype(kernel_pad_impl) kernel_pad_t; + +template [[host_name("kernel_pad_f32")]] kernel kernel_pad_t kernel_pad_impl; +template [[host_name("kernel_pad_f32_4")]] kernel kernel_pad_t kernel_pad_impl; + +// TODO: this is slow - optimize kernel void kernel_pad_reflect_1d_f32( constant ggml_metal_kargs_pad_reflect_1d & args, device const char * src0, @@ -7328,23 +7332,27 @@ kernel void kernel_cpy_t_t( device const char * src0, device char * dst, uint3 tgpig[[threadgroup_position_in_grid]], - ushort tiitg[[thread_index_in_threadgroup]], + ushort3 tpitg[[thread_position_in_threadgroup]], ushort3 ntg[[threads_per_threadgroup]]) { - const int i03 = tgpig[2]; - const int i02 = tgpig[1]; - const int i01 = ntg[1] == 1 ? tgpig[0]%args.ne01 : tgpig[0]*ntg[1] + tiitg/ntg[0]; - const int iw0 = ntg[1] == 1 ? tgpig[0]/args.ne01 : 0; + const int32_t i03 = tgpig[2]; + const int32_t i02 = tgpig[1]; + const int32_t i01 = ntg[1] == 1 ? tgpig[0]%args.ne01 : tgpig[0]*ntg[1] + tpitg.y; + const int32_t iw0 = ntg[1] == 1 ? tgpig[0]/args.ne01 : 0; + + if (i01 >= args.ne01) { + return; + } const int64_t n = i03*args.ne02*args.ne01*args.ne00 + i02*args.ne01*args.ne00 + i01*args.ne00; - const int64_t i3 = n/(args.ne2*args.ne1*args.ne0); - const int64_t i2 = (n - i3*args.ne2*args.ne1*args.ne0)/(args.ne1*args.ne0); - const int64_t i1 = (n - i3*args.ne2*args.ne1*args.ne0 - i2*args.ne1*args.ne0)/args.ne0; - const int64_t i0 = (n - i3*args.ne2*args.ne1*args.ne0 - i2*args.ne1*args.ne0 - i1*args.ne0); + const int32_t i3 = n/(args.ne2*args.ne1*args.ne0); + const int32_t i2 = (n - i3*args.ne2*args.ne1*args.ne0)/(args.ne1*args.ne0); + const int32_t i1 = (n - i3*args.ne2*args.ne1*args.ne0 - i2*args.ne1*args.ne0)/args.ne0; + const int32_t i0 = (n - i3*args.ne2*args.ne1*args.ne0 - i2*args.ne1*args.ne0 - i1*args.ne0); device T1 * dst_data = (device T1 *) (dst + i3*args.nb3 + i2*args.nb2 + i1*args.nb1 + i0*args.nb0); - for (int64_t i00 = iw0*ntg[0] + tiitg%ntg[0]; i00 < args.ne00; ) { + for (int32_t i00 = iw0*ntg[0] + tpitg.x; i00 < args.ne00;) { device const T0 * src = (device T0 *)(src0 + i03*args.nb03 + i02*args.nb02 + i01*args.nb01 + i00*args.nb00); dst_data[i00] = (T1) src[0]; break; @@ -7376,23 +7384,27 @@ kernel void kernel_cpy_f32_q( device const char * src0, device char * dst, uint3 tgpig[[threadgroup_position_in_grid]], - ushort tiitg[[thread_index_in_threadgroup]], + ushort3 tpitg[[thread_position_in_threadgroup]], ushort3 ntg[[threads_per_threadgroup]]) { - const int i03 = tgpig[2]; - const int i02 = tgpig[1]; - const int i01 = ntg[1] == 1 ? tgpig[0]%args.ne01 : tgpig[0]*ntg[1] + tiitg/ntg[0]; - const int iw0 = ntg[1] == 1 ? tgpig[0]/args.ne01 : 0; + const int32_t i03 = tgpig[2]; + const int32_t i02 = tgpig[1]; + const int32_t i01 = ntg[1] == 1 ? tgpig[0]%args.ne01 : tgpig[0]*ntg[1] + tpitg.y; + const int32_t iw0 = ntg[1] == 1 ? tgpig[0]/args.ne01 : 0; + + if (i01 >= args.ne01) { + return; + } const int64_t n = i03*args.ne02*args.ne01*args.ne00 + i02*args.ne01*args.ne00 + i01*args.ne00; - const int64_t i3 = n / (args.ne2*args.ne1*args.ne0); - const int64_t i2 = (n - i3*args.ne2*args.ne1*args.ne0) / (args.ne1*args.ne0); - const int64_t i1 = (n - i3*args.ne2*args.ne1*args.ne0 - i2*args.ne1*args.ne0) / args.ne0; - const int64_t i0 = (n - i3*args.ne2*args.ne1*args.ne0 - i2*args.ne1*args.ne0 - i1*args.ne0)/QK; + const int32_t i3 = n / (args.ne2*args.ne1*args.ne0); + const int32_t i2 = (n - i3*args.ne2*args.ne1*args.ne0) / (args.ne1*args.ne0); + const int32_t i1 = (n - i3*args.ne2*args.ne1*args.ne0 - i2*args.ne1*args.ne0) / args.ne0; + const int32_t i0 = (n - i3*args.ne2*args.ne1*args.ne0 - i2*args.ne1*args.ne0 - i1*args.ne0)/QK; device block_q * dst_data = (device block_q *)(dst + i3*args.nb3 + i2*args.nb2 + i1*args.nb1 + i0*args.nb0); - for (int64_t i00 = iw0*ntg[0] + tiitg%ntg[0]; i00 < args.nk0; ) { + for (int32_t i00 = iw0*ntg[0] + tpitg.x; i00 < args.nk0;) { device const float * src = (device const float *)(src0 + i03*args.nb03 + i02*args.nb02 + i01*args.nb01 + (i00*QK)*args.nb00); quantize_func(src, dst_data[i00]); @@ -7417,24 +7429,28 @@ kernel void kernel_cpy_q_f32( device const char * src0, device char * dst, uint3 tgpig[[threadgroup_position_in_grid]], - ushort tiitg[[thread_index_in_threadgroup]], + ushort3 tpitg[[thread_position_in_threadgroup]], ushort3 ntg[[threads_per_threadgroup]]) { - const int i03 = tgpig[2]; - const int i02 = tgpig[1]; - const int i01 = ntg[1] == 1 ? tgpig[0]%args.ne01 : tgpig[0]*ntg[1] + tiitg/ntg[0]; - const int iw0 = ntg[1] == 1 ? tgpig[0]/args.ne01 : 0; + const int32_t i03 = tgpig[2]; + const int32_t i02 = tgpig[1]; + const int32_t i01 = ntg[1] == 1 ? tgpig[0]%args.ne01 : tgpig[0]*ntg[1] + tpitg.y; + const int32_t iw0 = ntg[1] == 1 ? tgpig[0]/args.ne01 : 0; + + if (i01 >= args.ne01) { + return; + } const int64_t n = i03*args.ne02*args.ne01*args.ne00 + i02*args.ne01*args.ne00 + i01*args.ne00; - const int64_t i3 = n/(args.ne2*args.ne1*args.ne0); - const int64_t i2 = (n - i3*args.ne2*args.ne1*args.ne0)/(args.ne1*args.ne0); - const int64_t i1 = (n - i3*args.ne2*args.ne1*args.ne0 - i2*args.ne1*args.ne0)/args.ne0; - const int64_t i0 = (n - i3*args.ne2*args.ne1*args.ne0 - i2*args.ne1*args.ne0 - i1*args.ne0); + const int32_t i3 = n/(args.ne2*args.ne1*args.ne0); + const int32_t i2 = (n - i3*args.ne2*args.ne1*args.ne0)/(args.ne1*args.ne0); + const int32_t i1 = (n - i3*args.ne2*args.ne1*args.ne0 - i2*args.ne1*args.ne0)/args.ne0; + const int32_t i0 = (n - i3*args.ne2*args.ne1*args.ne0 - i2*args.ne1*args.ne0 - i1*args.ne0); device const block_q * src_data = (device const block_q *)(src0 + i03*args.nb03 + i02*args.nb02 + i01*args.nb01); device T4x4 * dst_data = (device T4x4 *)(dst + i3*args.nb3 + i2*args.nb2 + i1*args.nb1 + i0*args.nb0); - for (int64_t i00 = iw0*ntg[0] + tiitg%ntg[0]; i00 < args.nk0; ) { + for (int32_t i00 = iw0*ntg[0] + tpitg.x; i00 < args.nk0;) { T4x4 temp; dequantize_func(src_data + i00/nl, i00%nl, temp); dst_data[i00] = temp; diff --git a/src/models/delta-net-base.cpp b/src/models/delta-net-base.cpp index a67238383ed..4f4c7cac7a8 100644 --- a/src/models/delta-net-base.cpp +++ b/src/models/delta-net-base.cpp @@ -562,13 +562,13 @@ ggml_tensor * llm_build_delta_net_base::build_recurrent_attn( } const int64_t D = S_v * S_v * H_v; - const int64_t K = (int64_t) cparams.n_rs_seq + 1; + const int64_t K = cparams.n_rs_seq + 1; // TODO: remove pad + simplify - ggml_tensor * state_in_3d = ggml_reshape_3d(ctx0, s, D, 1, n_seqs); - ggml_tensor * state_3d = ggml_pad(ctx0, state_in_3d, 0, K - 1, 0, 0); + ggml_tensor * s_3d = ggml_reshape_3d(ctx0, s, D, 1, n_seqs); + ggml_tensor * s_3d_pad = ggml_pad (ctx0, s_3d, 0, K - 1, 0, 0); - ggml_tensor * gdn_out = ggml_gated_delta_net(ctx0, q, k, v, g, b, state_3d); + ggml_tensor * gdn_out = ggml_gated_delta_net(ctx0, q, k, v, g, b, s_3d_pad); if (n_seq_tokens > 1) { cb(gdn_out, LLAMA_TENSOR_NAME_FGDN_CH, il); } else { diff --git a/tools/mtmd/clip.cpp b/tools/mtmd/clip.cpp index 9727a738ed8..abad4afe9f0 100644 --- a/tools/mtmd/clip.cpp +++ b/tools/mtmd/clip.cpp @@ -162,8 +162,14 @@ struct clip_ctx { bool debug_output_embeddings = false; + // for measuring memory usage + bool no_alloc = false; + std::map mem_usage; + std::map mem_compute; + clip_ctx(clip_context_params & ctx_params) { flash_attn_type = ctx_params.flash_attn_type; + no_alloc = ctx_params.no_alloc; backend_cpu = ggml_backend_init_by_type(GGML_BACKEND_DEVICE_TYPE_CPU, nullptr); if (!backend_cpu) { throw std::runtime_error("failed to initialize CPU backend"); @@ -1688,6 +1694,8 @@ struct clip_model_loader { ggml_set_name(data_tensor, cur->name); loaded_tensor_names.insert(name); cur = data_tensor; + // add to weight memory counter + ctx_clip.mem_usage[ggml_backend_get_device(ctx_clip.backend)] += ggml_nbytes(cur); } return cur; }; @@ -2602,7 +2610,7 @@ struct clip_model_loader { } // load data - { + if (!ctx_clip.no_alloc) { std::vector read_buf; // alloc memory and offload data @@ -2676,7 +2684,7 @@ struct clip_model_loader { if (ctx_clip.flash_attn_type == CLIP_FLASH_ATTN_TYPE_AUTO) { // try to enable flash attention to see if it's supported ctx_clip.flash_attn_type = CLIP_FLASH_ATTN_TYPE_ENABLED; - info = alloc_compute_meta(ctx_clip, batch); + info = reserve_compute_meta(ctx_clip, batch); if (!info.fattn && info.fattn_op) { auto op = info.fattn_op; LOG_WRN("%s: *****************************************************************\n", __func__); @@ -2695,10 +2703,10 @@ struct clip_model_loader { LOG_WRN("%s: please report this on github as an issue\n", __func__); LOG_WRN("%s: *****************************************************************\n", __func__); ctx_clip.flash_attn_type = CLIP_FLASH_ATTN_TYPE_DISABLED; - alloc_compute_meta(ctx_clip, batch); + reserve_compute_meta(ctx_clip, batch); } } else { - info = alloc_compute_meta(ctx_clip, batch); + info = reserve_compute_meta(ctx_clip, batch); if (!info.fattn && ctx_clip.flash_attn_type == CLIP_FLASH_ATTN_TYPE_ENABLED) { LOG_WRN("%s: flash attention is not supported by the current backend; falling back to CPU (performance will be degraded)\n", __func__); } @@ -2737,12 +2745,14 @@ struct clip_model_loader { } } - static support_info_graph alloc_compute_meta(clip_ctx & ctx_clip, const clip_image_f32_batch & batch) { + // only initialize backend buffers, but do not allocate them yet + static support_info_graph reserve_compute_meta(clip_ctx & ctx_clip, const clip_image_f32_batch & batch) { ctx_clip.buf_compute_meta.resize(ctx_clip.max_nodes * ggml_tensor_overhead() + ggml_graph_overhead()); ggml_cgraph * gf = clip_image_build_graph(&ctx_clip, batch); ggml_backend_sched_reserve(ctx_clip.sched.get(), gf); + ctx_clip.mem_compute.clear(); for (size_t i = 0; i < ctx_clip.backend_ptrs.size(); ++i) { ggml_backend_t backend = ctx_clip.backend_ptrs[i]; ggml_backend_buffer_type_t buft = ctx_clip.backend_buft[i]; @@ -2752,6 +2762,7 @@ struct clip_model_loader { ggml_backend_buft_name(buft), size / 1024.0 / 1024.0); } + ctx_clip.mem_compute[ggml_backend_get_device(backend)] += size; } const int n_splits = ggml_backend_sched_get_n_splits(ctx_clip.sched.get()); @@ -4266,22 +4277,6 @@ int clip_n_mmproj_embd(const struct clip_ctx * ctx) { } } -int clip_is_minicpmv(const struct clip_ctx * ctx) { - // TODO: remove this function - if (ctx->proj_type() == PROJECTOR_TYPE_MINICPMV) { - return ctx->model.hparams.minicpmv_version; - } - if (ctx->proj_type() == PROJECTOR_TYPE_MINICPMV4_6) { - return 46; - } - return 0; -} - -bool clip_is_glm(const struct clip_ctx * ctx) { - // TODO: remove this function - return ctx->proj_type() == PROJECTOR_TYPE_GLM_EDGE; -} - bool clip_is_llava(const struct clip_ctx * ctx) { return ctx->model.hparams.has_llava_projector; } @@ -4330,6 +4325,14 @@ const clip_hparams * clip_get_hparams(const struct clip_ctx * ctx) { return &ctx->model.hparams; } +std::map clip_get_mem_usage(const struct clip_ctx * ctx) { + std::map result = ctx->mem_usage; + for (auto & [dev, size] : ctx->mem_compute) { + result[dev] += size; + } + return result; +} + // // API for debugging // diff --git a/tools/mtmd/clip.h b/tools/mtmd/clip.h index f643ed6e979..9b807ffa77b 100644 --- a/tools/mtmd/clip.h +++ b/tools/mtmd/clip.h @@ -6,6 +6,8 @@ #include #include +#include + // !!! Internal header, to be used by mtmd only !!! #define MTMD_INTERNAL_HEADER @@ -40,6 +42,7 @@ struct clip_context_params { bool warmup; ggml_backend_sched_eval_callback cb_eval; void * cb_eval_user_data; + bool no_alloc; }; struct clip_init_result { @@ -102,8 +105,6 @@ struct ggml_tensor * clip_get_newline_tensor(const struct clip_ctx * ctx); bool clip_image_encode (struct clip_ctx * ctx, int n_threads, struct clip_image_f32 * img, float * vec); bool clip_image_batch_encode(struct clip_ctx * ctx, int n_threads, const struct clip_image_f32_batch * imgs, float * vec); -int clip_is_minicpmv(const struct clip_ctx * ctx); -bool clip_is_glm(const struct clip_ctx * ctx); bool clip_is_llava(const struct clip_ctx * ctx); // note for contributor: this clip_is_(model) pattern is deprecated // do NOT add new functions like this @@ -116,6 +117,8 @@ void clip_image_f32_batch_add_mel(struct clip_image_f32_batch * batch, int n_mel bool clip_has_vision_encoder(const struct clip_ctx * ctx); bool clip_has_audio_encoder(const struct clip_ctx * ctx); +std::map clip_get_mem_usage(const struct clip_ctx * ctx); + struct clip_cap { bool has_vision; bool has_audio; diff --git a/tools/mtmd/mtmd.cpp b/tools/mtmd/mtmd.cpp index 8f12d0b43ea..ce047d11410 100644 --- a/tools/mtmd/mtmd.cpp +++ b/tools/mtmd/mtmd.cpp @@ -21,6 +21,7 @@ #include #include #include +#include #include // represents raw image data, layout is RGBRGBRGB... @@ -139,13 +140,13 @@ mtmd_context_params mtmd_context_params_default() { struct mtmd_context { struct clip_ctx * ctx_v; // vision struct clip_ctx * ctx_a; // audio - const struct llama_model * text_model; std::vector image_embd_v; // image embedding vector bool print_timings; int n_threads; std::string media_marker; - const int n_embd_text; + const int n_embd_text = -1; // -1 means llm context not provided, skip checking this + const llama_vocab * vocab = nullptr; // can be nullptr if text_model is not provided mtmd_pos_type pos_type; // these are not token, but strings used to mark the beginning and end of image/audio embeddings @@ -178,12 +179,13 @@ struct mtmd_context { mtmd_context(const char * mmproj_fname, const llama_model * text_model, - const mtmd_context_params & ctx_params) : - text_model (text_model), + const mtmd_context_params & ctx_params, + bool no_alloc = false) : print_timings(ctx_params.print_timings), n_threads (ctx_params.n_threads), media_marker (ctx_params.media_marker), - n_embd_text (llama_model_n_embd_inp(text_model)) + n_embd_text (text_model ? llama_model_n_embd_inp(text_model) : -1), + vocab (text_model ? llama_model_get_vocab(text_model) : nullptr) { if (ctx_params.image_marker != nullptr) { throw std::runtime_error("custom image_marker is not supported anymore, use media_marker instead"); @@ -193,21 +195,23 @@ struct mtmd_context { throw std::runtime_error("media_marker must not be empty"); } - auto decoder_rope_type = llama_model_rope_type(text_model); - switch (decoder_rope_type) { - case LLAMA_ROPE_TYPE_NONE: - case LLAMA_ROPE_TYPE_NORM: - case LLAMA_ROPE_TYPE_NEOX: - { - pos_type = MTMD_POS_TYPE_NORMAL; - } break; - case LLAMA_ROPE_TYPE_MROPE: - case LLAMA_ROPE_TYPE_IMROPE: - { - pos_type = MTMD_POS_TYPE_MROPE; - } break; - default: - throw std::runtime_error(string_format("unsupported decoder rope type: %d\n", decoder_rope_type)); + if (text_model) { + auto decoder_rope_type = llama_model_rope_type(text_model); + switch (decoder_rope_type) { + case LLAMA_ROPE_TYPE_NONE: + case LLAMA_ROPE_TYPE_NORM: + case LLAMA_ROPE_TYPE_NEOX: + { + pos_type = MTMD_POS_TYPE_NORMAL; + } break; + case LLAMA_ROPE_TYPE_MROPE: + case LLAMA_ROPE_TYPE_IMROPE: + { + pos_type = MTMD_POS_TYPE_MROPE; + } break; + default: + throw std::runtime_error(string_format("unsupported decoder rope type: %d\n", decoder_rope_type)); + } } clip_context_params ctx_clip_params { @@ -218,6 +222,7 @@ struct mtmd_context { /* warmup */ ctx_params.warmup, /* cb_eval */ ctx_params.cb_eval, /* cb_eval_user_data */ ctx_params.cb_eval_user_data, + /* no_alloc */ no_alloc, }; auto res = clip_init(mmproj_fname, ctx_clip_params); @@ -241,7 +246,7 @@ struct mtmd_context { // since we already validate n_embd of vision and audio mmproj, // we can safely assume that they are the same int n_embd_clip = clip_n_mmproj_embd(ctx_v ? ctx_v : ctx_a); - if (n_embd_text != n_embd_clip) { + if (n_embd_text > 0 && n_embd_text != n_embd_clip) { throw std::runtime_error(string_format( "mismatch between text model (n_embd = %d) and mmproj (n_embd = %d)\n" "hint: you may be using wrong mmproj\n", @@ -279,7 +284,7 @@ struct mtmd_context { } break; case PROJECTOR_TYPE_MINICPMV: { - int minicpmv_version = clip_is_minicpmv(ctx_v); + int minicpmv_version = clip_get_hparams(ctx_v)->minicpmv_version; if (minicpmv_version == 2) { // minicpmv 2.5 format: // (overview) (slice) (slice) \n ... @@ -594,7 +599,11 @@ struct mtmd_context { private: llama_token lookup_token(const std::string & token_text) { - const llama_vocab * vocab = llama_model_get_vocab(text_model); + if (vocab == nullptr) { + // TODO @ngxson : this case is currently hit by mtmd_get_memory_usage + // but we should reconsider this if this case is needed in other places in the future + return LLAMA_TOKEN_NULL; + } const int n_vocab = llama_vocab_n_tokens(vocab); for (int i = 0; i < n_vocab; i++) { if (token_to_piece(vocab, i, true) == token_text) { @@ -605,6 +614,9 @@ struct mtmd_context { } std::string token_to_piece(const llama_vocab * vocab, llama_token token, bool special) { + if (vocab == nullptr) { + throw std::runtime_error("llama_vocab is not provided"); + } std::string piece; piece.resize(piece.capacity()); // using string internal cache, 15 bytes + '\n' const int n_chars = llama_token_to_piece(vocab, token, &piece[0], piece.size(), 0, special); @@ -653,7 +665,7 @@ struct mtmd_tokenizer { add_special = text->add_special; parse_special = text->parse_special; input_text = text->text; - vocab = llama_model_get_vocab(ctx->text_model); + vocab = ctx->vocab; } int32_t tokenize(mtmd_input_chunks * output) { @@ -679,27 +691,29 @@ struct mtmd_tokenizer { } } - if (add_special && llama_vocab_get_add_bos(vocab)) { - // if first chunk is text, we add BOS token to first text chunk - // otherwise, create a new text chunk with BOS token - if (!cur.entries.empty() && cur.entries[0].type == MTMD_INPUT_CHUNK_TYPE_TEXT) { - // add BOS token to the beginning of first text chunk - cur.entries[0].tokens_text.insert(cur.entries[0].tokens_text.begin(), llama_vocab_bos(vocab)); - } else { - // create a new text chunk with BOS token at the beginning - mtmd_input_chunk bos_chunk{ - MTMD_INPUT_CHUNK_TYPE_TEXT, - {llama_vocab_bos(vocab)}, - nullptr, // image tokens - nullptr, // audio tokens - }; - cur.entries.insert(cur.entries.begin(), std::move(bos_chunk)); + if (vocab != nullptr) { + if (add_special && llama_vocab_get_add_bos(vocab)) { + // if first chunk is text, we add BOS token to first text chunk + // otherwise, create a new text chunk with BOS token + if (!cur.entries.empty() && cur.entries[0].type == MTMD_INPUT_CHUNK_TYPE_TEXT) { + // add BOS token to the beginning of first text chunk + cur.entries[0].tokens_text.insert(cur.entries[0].tokens_text.begin(), llama_vocab_bos(vocab)); + } else { + // create a new text chunk with BOS token at the beginning + mtmd_input_chunk bos_chunk{ + MTMD_INPUT_CHUNK_TYPE_TEXT, + {llama_vocab_bos(vocab)}, + nullptr, // image tokens + nullptr, // audio tokens + }; + cur.entries.insert(cur.entries.begin(), std::move(bos_chunk)); + } } - } - if (add_special && llama_vocab_get_add_eos(vocab)) { - // if last chunk is text, we add EOS token to it - add_text({llama_vocab_eos(vocab)}); + if (add_special && llama_vocab_get_add_eos(vocab)) { + // if last chunk is text, we add EOS token to it + add_text({llama_vocab_eos(vocab)}); + } } if (i_bm != bitmaps.size()) { @@ -714,6 +728,9 @@ struct mtmd_tokenizer { } void add_text(const std::string & txt, bool parse_special) { + if (vocab == nullptr) { + throw std::runtime_error("llama_vocab is not provided"); + } LOG_DBG("%s: %s\n", __func__, txt.c_str()); auto tokens = mtmd_tokenize_text_internal(vocab, txt, /* add_special */ false, parse_special); add_text(tokens); @@ -1002,10 +1019,16 @@ struct mtmd_tokenizer { const std::string & text, bool add_special, bool parse_special) { + if (vocab == nullptr) { + throw std::runtime_error("llama_vocab is not provided"); + } // upper limit for the number of tokens int n_tokens = text.length() + 2 * add_special; std::vector result(n_tokens); n_tokens = llama_tokenize(vocab, text.data(), text.length(), result.data(), result.size(), add_special, parse_special); + if (n_tokens == std::numeric_limits::min()) { + throw std::runtime_error("Tokenization failed: input text too large, tokenization result exceeds int32_t limit"); + } if (n_tokens < 0) { result.resize(-n_tokens); int check = llama_tokenize(vocab, text.data(), text.length(), result.data(), result.size(), add_special, parse_special); @@ -1067,8 +1090,8 @@ int32_t mtmd_encode(mtmd_context * ctx, const mtmd_image_tokens * image_tokens) bool ok = false; if (clip_is_llava(ctx_clip) - || clip_is_minicpmv(ctx_clip) - || clip_is_glm(ctx_clip) + || proj_type == PROJECTOR_TYPE_MINICPMV + || proj_type == PROJECTOR_TYPE_GLM_EDGE || proj_type == PROJECTOR_TYPE_INTERNVL) { // TODO @ngxson : llava does not support batched encoding ; this should be fixed inside clip_image_batch_encode() const auto & entries = image_tokens->batch_f32.entries; @@ -1542,3 +1565,36 @@ void mtmd_debug_preprocess_audio(mtmd_context * ctx, const std::vector & } } } + +static void stub_log_callback(enum ggml_log_level, const char *, void *) { + // do nothing +} + +std::map mtmd_get_memory_usage(const char * mmproj_fname, + struct mtmd_context_params ctx_params) { + mtmd::context_ptr ctx; + auto saved_log_callback = g_logger_state.log_callback; + auto saved_log_user_data = g_logger_state.log_callback_user_data; + try { + mtmd_log_set(stub_log_callback, nullptr); // suppress logging + ctx.reset(new mtmd_context(mmproj_fname, nullptr, ctx_params)); + mtmd_log_set(saved_log_callback, saved_log_user_data); // restore log callback + std::map total_mem; + auto merge = [&](const struct clip_ctx * c) { + for (auto & [dev, size] : clip_get_mem_usage(c)) { + total_mem[dev] += size; + } + }; + if (ctx->ctx_v) { + merge(ctx->ctx_v); + } + if (ctx->ctx_a) { + merge(ctx->ctx_a); + } + return total_mem; + } catch (const std::exception & e) { + mtmd_log_set(saved_log_callback, saved_log_user_data); // restore log callback + LOG_ERR("%s: error: %s\n", __func__, e.what()); + return {}; + } +} diff --git a/tools/mtmd/mtmd.h b/tools/mtmd/mtmd.h index 54b9515a3ea..5d518df799e 100644 --- a/tools/mtmd/mtmd.h +++ b/tools/mtmd/mtmd.h @@ -9,6 +9,7 @@ #include #ifdef __cplusplus +#include #include #include #include @@ -261,6 +262,14 @@ MTMD_API mtmd_input_chunks * mtmd_test_create_input_chunks(void); } // extern "C" #endif +// Get memory usage of the current model in bytes, per backend device +// Note: this is an unstable API, used internally by fit_params; it WILL be removed or changed without deprecation +#ifdef __cplusplus +MTMD_API std::map mtmd_get_memory_usage( + const char * mmproj_fname, + struct mtmd_context_params ctx_params); +#endif + // // C++ wrappers // diff --git a/tools/server/server-context.cpp b/tools/server/server-context.cpp index dc3189e1705..f517310266c 100644 --- a/tools/server/server-context.cpp +++ b/tools/server/server-context.cpp @@ -746,6 +746,46 @@ struct server_context_impl { params_base = params; + std::string & mmproj_path = params_base.mmproj.path; + bool has_mmproj = !mmproj_path.empty(); + mtmd_context_params mparams = mtmd_context_params_default(); + if (has_mmproj) { + mparams.use_gpu = params_base.mmproj_use_gpu; + mparams.print_timings = false; + mparams.n_threads = params_base.cpuparams.n_threads; + mparams.flash_attn_type = params_base.flash_attn_type; + mparams.warmup = params_base.warmup; + mparams.image_min_tokens = params_base.image_min_tokens; + mparams.image_max_tokens = params_base.image_max_tokens; + mparams.media_marker = get_media_marker(); + } + + // optionally get the memory usage of mmproj + if (has_mmproj && params_base.fit_params) { + auto mmproj_mem = mtmd_get_memory_usage(mmproj_path.c_str(), mparams); + if (!mmproj_mem.empty()) { + size_t total = 0; + for (auto & [dev, size] : mmproj_mem) { + total += size; + } + SRV_INF("[mtmd] estimated memory usage of mmproj is %.2f MiB\n", total / (1024.0 * 1024.0)); + GGML_ASSERT(!params_base.fit_params_target.empty()); + for (auto & [dev, size] : mmproj_mem) { + for (size_t i = 0; i < ggml_backend_dev_count(); i++) { + if (ggml_backend_dev_get(i) == dev) { + if (i < params_base.fit_params_target.size()) { + SRV_DBG("[mtmd] adding %.2f MiB to fit_params_target for device %s\n", size / (1024.0 * 1024.0), ggml_backend_dev_name(dev)); + params_base.fit_params_target[i] += size; + } + break; + } + } + } + } else { + SRV_ERR("%s", "[mtmd] failed to get memory usage of mmproj\n"); + } + } + llama_init = common_init_from_params(params_base); model_tgt = llama_init->model(); @@ -830,18 +870,10 @@ struct server_context_impl { params_base.speculative.draft.ctx_dft = ctx_dft.get(); } - std::string & mmproj_path = params_base.mmproj.path; - if (!mmproj_path.empty()) { - mtmd_context_params mparams = mtmd_context_params_default(); - - mparams.use_gpu = params_base.mmproj_use_gpu; - mparams.print_timings = false; - mparams.n_threads = params_base.cpuparams.n_threads; - mparams.flash_attn_type = params_base.flash_attn_type; - mparams.warmup = params_base.warmup; - mparams.image_min_tokens = params_base.image_min_tokens; - mparams.image_max_tokens = params_base.image_max_tokens; - mparams.media_marker = get_media_marker(); + if (has_mmproj) { + if (!is_resume) { + mtmd_helper_log_set(common_log_default_callback, nullptr); + } mctx = mtmd_init_from_file(mmproj_path.c_str(), model_tgt, mparams); if (mctx == nullptr) { diff --git a/tools/ui/src/lib/components/app/chat/ChatForm/ChatFormActions/ChatFormActionAdd/ChatFormActionsAdd.svelte b/tools/ui/src/lib/components/app/chat/ChatForm/ChatFormActions/ChatFormActionAdd/ChatFormActionsAdd.svelte index 54ddcf9b0a7..6a91bf90524 100644 --- a/tools/ui/src/lib/components/app/chat/ChatForm/ChatFormActions/ChatFormActionAdd/ChatFormActionsAdd.svelte +++ b/tools/ui/src/lib/components/app/chat/ChatForm/ChatFormActions/ChatFormActionAdd/ChatFormActionsAdd.svelte @@ -1,5 +1,5 @@ {#if isMobile.current} diff --git a/tools/ui/src/lib/components/app/chat/ChatForm/ChatFormActions/ChatFormActionModels.svelte b/tools/ui/src/lib/components/app/chat/ChatForm/ChatFormActions/ChatFormActionModels.svelte index 29702060565..07f079f5b51 100644 --- a/tools/ui/src/lib/components/app/chat/ChatForm/ChatFormActions/ChatFormActionModels.svelte +++ b/tools/ui/src/lib/components/app/chat/ChatForm/ChatFormActions/ChatFormActionModels.svelte @@ -3,7 +3,7 @@ import { modelsStore, modelOptions, selectedModelId } from '$lib/stores/models.svelte'; import { isRouterMode, serverError } from '$lib/stores/server.svelte'; import { ModelsSelectorDropdown, ModelsSelectorSheet } from '$lib/components/app'; - import { IsMobile } from '$lib/hooks/is-mobile.svelte'; + import { isMobile } from '$lib/stores/viewport.svelte'; import { activeMessages } from '$lib/stores/conversations.svelte'; interface Props { @@ -152,8 +152,6 @@ let selectorModelRef: ModelsSelectorDropdown | ModelsSelectorSheet | undefined = $state(undefined); - let isMobile = new IsMobile(); - export function open() { selectorModelRef?.open(); } diff --git a/tools/ui/src/lib/components/app/chat/ChatScreen/ChatScreenActionScrollDown.svelte b/tools/ui/src/lib/components/app/chat/ChatScreen/ChatScreenActionScrollDown.svelte index bbfed95e948..c43bee3e3c3 100644 --- a/tools/ui/src/lib/components/app/chat/ChatScreen/ChatScreenActionScrollDown.svelte +++ b/tools/ui/src/lib/components/app/chat/ChatScreen/ChatScreenActionScrollDown.svelte @@ -41,12 +41,16 @@ }); -
+