diff --git a/CLAUDE.md b/CLAUDE.md index e654df78..69d41dfe 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -6,7 +6,7 @@ This file provides guidance to Claude Code (claude.ai/code) when working with co Java bindings for [llama.cpp](https://github.com/ggerganov/llama.cpp) via JNI, providing a high-level API for LLM inference in Java. The Java layer communicates with a native C++ library through JNI. -Current llama.cpp pinned version: **b9682** +Current llama.cpp pinned version: **b9739** ## Upgrading CUDA Version @@ -193,7 +193,7 @@ needs no extra step here, `build-webui` re-reads the tag and rebuilds the matchi ships no UI): ```bash # needs node/npm + network; embed.cpp is plain C++17 (no npm) -git clone --depth 1 --branch b9682 https://github.com/ggml-org/llama.cpp /tmp/lc +git clone --depth 1 --branch b9739 https://github.com/ggml-org/llama.cpp /tmp/lc ( cd /tmp/lc/tools/ui && npm ci && npm run build \ && ( cd dist && find . -type f -not -path './_gzip/*' \ | while read -r f; do mkdir -p "_gzip/$(dirname "$f")"; gzip -9 -c "$f" > "_gzip/$f"; done ) \ @@ -227,7 +227,7 @@ plus a cache token are present, `build.sh` adds - `SCCACHE_WEBDAV_TOKEN: ${{ secrets.DEPOT_TOKEN }}` — a Depot **organization** token, stored as the repo secret **`DEPOT_TOKEN`**. -Because `sccache` is **content-addressed** and llama.cpp is pinned (`GIT_TAG b9682`), the +Because `sccache` is **content-addressed** and llama.cpp is pinned (`GIT_TAG b9739`), the ~280 upstream object files are byte-identical every run, so a warm cache recompiles only the *changed* files. Depot's cache is **shared across all branches** (unlike GitHub's per-branch `actions/cache`), so every branch builds incrementally; a `b` version bump @@ -765,7 +765,7 @@ ctest --test-dir build --output-on-failure -R "ResultsToJson" | File | Tests | Scope | |------|-------|-------| | `src/test/cpp/test_utils.cpp` | 156 | Upstream helpers: `server_tokens`, `server_grammar_trigger`, `gen_tool_call_id`, `json_value`, `json_get_nested_values`, UTF-8 helpers, `format_response_rerank`, `format_embeddings_response_oaicompat`, `oaicompat_completion_params_parse`, `oaicompat_chat_params_parse`, `are_lora_equal`, `strip_flag_from_argv`, `token_piece_value`, `json_is_array_and_contains_numbers`, `format_oai_sse`, `format_oai_resp_sse`, `format_anthropic_sse` | -| `src/test/cpp/test_server.cpp` | 188 | Upstream result types: `result_timings`, `task_params::to_json()` (incl. `dry_sequence_breakers`, `preserved_tokens`, `timings_per_token`), `completion_token_output`, `server_task_result_cmpl_partial` (non-oaicompat + `to_json_oaicompat` + logprobs + `to_json_oaicompat_chat` + `to_json_anthropic` + dispatcher), `server_task_result_cmpl_final` (non-oaicompat + `to_json_oaicompat` + `to_json_oaicompat_chat` + `to_json_oaicompat_chat_stream` + `to_json_anthropic` + `to_json_anthropic_stream` + tool_calls + dispatcher), `server_task_result_embd`, `server_task_result_rerank`, `server_task_result_metrics`, `server_task_result_slot_save_load`, `server_task_result_slot_erase`, `server_task_result_apply_lora`, `server_task_result_error`, `format_error_response`, `server_task::need_sampling()`, `server_task::n_tokens()`, `server_task::params_from_json_cmpl()` (parsing pipeline + grammar routing + error paths), `response_fields` projection | +| `src/test/cpp/test_server.cpp` | 188 | Upstream result types: `result_timings`, `task_params::to_json()` (incl. `dry_sequence_breakers`, `preserved_tokens`, `timings_per_token`), `completion_token_output`, `server_task_result_cmpl_partial` (non-oaicompat + `to_json_oaicompat` + logprobs + `to_json_oaicompat_chat` + `to_json_anthropic` + dispatcher), `server_task_result_cmpl_final` (non-oaicompat + `to_json_oaicompat` + `to_json_oaicompat_chat` + `to_json_oaicompat_chat_stream` + `to_json_anthropic` + `to_json_anthropic_stream` + tool_calls + dispatcher), `server_task_result_embd`, `server_task_result_rerank`, `server_task_result_metrics`, `server_task_result_slot_save_load`, `server_task_result_slot_erase`, `server_task_result_apply_lora`, `server_task_result_error`, `format_error_response`, `server_task::need_sampling()`, `server_task::n_tokens()`, `server_schema::eval_llama_cmpl_schema()` (parsing pipeline + grammar routing + error paths), `response_fields` projection | | `src/test/cpp/test_json_helpers.cpp` | 47 | All functions in `json_helpers.hpp`: `get_result_error_message`, `results_to_json`, `rerank_results_to_json`, `parse_encoding_format`, `extract_embedding_prompt`, `is_infill_request`, `parse_slot_prompt_similarity`, `parse_positive_int_config`, `wrap_stream_chunk` | | `src/test/cpp/test_log_helpers.cpp` | 13 | All functions in `log_helpers.hpp`: `log_level_name`, `format_log_as_json` | | `src/test/cpp/test_jni_helpers.cpp` | 41 | All functions in `jni_helpers.hpp` using a zero-filled `JNINativeInterface_` mock | @@ -774,7 +774,7 @@ ctest --test-dir build --output-on-failure -R "ResultsToJson" #### Upstream source location (in CMake build tree) -llama.cpp is fetched via CMake FetchContent, pinned to `GIT_TAG b9682`. +llama.cpp is fetched via CMake FetchContent, pinned to `GIT_TAG b9739`. **GoogleTest** is a separate `BUILD_TESTING`-only FetchContent (`GIT_TAG v1.17.0`), used solely by the `jllama_test` C++ unit-test binary — not by the shipped library, and not coupled to the @@ -877,9 +877,9 @@ f.timings.prompt_n = 5; EXPECT_TRUE(j.contains("timings")); ``` -**3. Parameter parsing (`params_from_json_cmpl`) without a model** +**3. Parameter parsing (`eval_llama_cmpl_schema`) without a model** -`server_task::params_from_json_cmpl(vocab, params_base, n_ctx_slot, logit_bias_eog, data)` +`server_schema::eval_llama_cmpl_schema(vocab, params_base, n_ctx_slot, logit_bias_eog, data)` can be called with `nullptr` vocab **if the JSON does not trigger grammar/preserved_tokens tokenisation** (those are the only vocab-dependent paths). This lets us test the full parsing pipeline including error throws: @@ -891,12 +891,12 @@ const int n_ctx = 512; // test: repeat_last_n=-1 is expanded to n_ctx_slot json data = {{"repeat_last_n", -1}}; -auto p = server_task::params_from_json_cmpl(nullptr, params_base, n_ctx, no_bias, data); +auto p = server_schema::eval_llama_cmpl_schema(nullptr, params_base, n_ctx, no_bias, data); EXPECT_EQ(p.sampling.penalty_last_n, n_ctx); // test: invalid value throws std::runtime_error json bad = {{"dry_sequence_breakers", json::array()}}; // empty → error -EXPECT_THROW(server_task::params_from_json_cmpl(nullptr, params_base, n_ctx, no_bias, bad), +EXPECT_THROW(server_schema::eval_llama_cmpl_schema(nullptr, params_base, n_ctx, no_bias, bad), std::runtime_error); ``` diff --git a/CMakeLists.txt b/CMakeLists.txt index f9cb148d..0578ca7e 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -139,7 +139,7 @@ set(LLAMA_BUILD_APP OFF CACHE BOOL "" FORCE) FetchContent_Declare( llama.cpp GIT_REPOSITORY https://github.com/ggerganov/llama.cpp.git - GIT_TAG b9682 + GIT_TAG b9739 ) FetchContent_MakeAvailable(llama.cpp) @@ -270,6 +270,7 @@ target_sources(jllama PRIVATE ${llama.cpp_SOURCE_DIR}/tools/server/server-context.cpp ${llama.cpp_SOURCE_DIR}/tools/server/server-queue.cpp ${llama.cpp_SOURCE_DIR}/tools/server/server-task.cpp + ${llama.cpp_SOURCE_DIR}/tools/server/server-schema.cpp ) if(NOT ANDROID_ABI AND NOT OS_NAME MATCHES "Android") target_sources(jllama PRIVATE diff --git a/README.md b/README.md index 97650d5f..3c5244a8 100644 --- a/README.md +++ b/README.md @@ -1,7 +1,7 @@ **Build:** ![Java 8+](https://img.shields.io/badge/Java-8%2B-informational) ![Platform](https://img.shields.io/badge/Platform-Linux%20%7C%20macOS%20%7C%20Windows%20%7C%20Android-lightgrey) -[![llama.cpp b9682](https://img.shields.io/badge/llama.cpp-%23b9682-informational)](https://github.com/ggml-org/llama.cpp/releases/tag/b9682) +[![llama.cpp b9739](https://img.shields.io/badge/llama.cpp-%23b9739-informational)](https://github.com/ggml-org/llama.cpp/releases/tag/b9739) [![JPMS](https://img.shields.io/badge/JPMS-modular%20JAR-25A162)](https://openjdk.org/projects/jigsaw/) ![JUnit](https://img.shields.io/badge/tested%20with-JUnit6-25A162) [![JSpecify](https://img.shields.io/badge/JSpecify-1.0.0%20%40NullMarked-25A162)](https://jspecify.dev) diff --git a/docs/history/llama-cpp-breaking-changes.md b/docs/history/llama-cpp-breaking-changes.md index bb1309df..c4c9ec04 100644 --- a/docs/history/llama-cpp-breaking-changes.md +++ b/docs/history/llama-cpp-breaking-changes.md @@ -361,3 +361,15 @@ Used during `llama.cpp` version bumps: when upgrading, scan this file from the r | b9642–b9682 | `common/speculative.{h,cpp}` | Speculative decoding now accumulates per-draft-position acceptance statistics and adds an Eagle3 backend-sampling path (the draft model samples on the compute backend). `common_speculative_*` is compiled into `common` and reached only through the upstream server's speculative slot; the project's C++ references no `speculative`/`draft` symbol. No project source changes required. **New feature:** per-position draft-acceptance metrics — could surface as speculative-decoding telemetry in a future Java API | | b9642–b9682 | `tools/server/server-context.cpp` | Server slot refactored so an `mtmd` (multimodal) prompt can feed a speculative draft model: image/media chunks are routed through the new `mtmd_helper_decode_image_chunk` callback before drafting. Compiled directly into `jllama` (the project builds `server-context/queue/task/models`), but the change is internal to the slot state machine and binds no new/renamed symbol; verified that `jllama.cpp` and the `*_helpers.hpp` headers call none of the touched functions. No project source changes required | | b9642–b9682 | `ggml/src/ggml-*` backends, `tools/` (incl. `llama-bench --offline`), conda-forge packaging, `docs/`, `.github/` | Routine backend kernel updates and tooling/docs/CI tweaks (a new `llama-bench --offline` flag, conda-forge recipe notes). None are compiled into `jllama` beyond the already-built CPU/CUDA/Metal/OpenCL backends, and none change a symbol the project binds. No project changes required | +| b9682–b9739 | `tools/server/server-schema.{h,cpp}` (new) + `tools/server/server-task.{h,cpp}` | **Build-breaking.** `server_task::params_from_json_cmpl()` MOVED to `server_schema::eval_llama_cmpl_schema()` in new `server-schema.h`/`server-schema.cpp`. **Required project changes**: (1) add `server-schema.cpp` to the `target_sources(jllama ...)` block in `CMakeLists.txt`; (2) add `#include "server-schema.h"` in `src/main/cpp/jllama.cpp` and `src/test/cpp/test_server.cpp`; (3) update the call sites in `jllama.cpp:203` and `test_server.cpp:1722` from `server_task::params_from_json_cmpl(...)` to `server_schema::eval_llama_cmpl_schema(...)` | +| b9682–b9739 | `common/common.h` (`common_params_model`) | `common_params_model::name` field REMOVED; replaced by `get_name()` method. Not referenced in project source (model name is read from `server_context_meta::model_name`, populated upstream) — no project source changes required | +| b9682–b9739 | `common/common.h` (`common_params`) | `webui`, `webui_mcp_proxy`, `webui_config_json` fields REMOVED (deprecated aliases; replaced by `ui`/`ui_mcp_proxy`/`ui_config_json` introduced in b9172). Project never references these fields directly — no project source changes required | +| b9682–b9739 | `tools/server/server-models.h` + `server-models.cpp` | `server_state` enum: `SERVER_STATE_LOADING_MODEL` renamed to `SERVER_STATE_LOADING`; new `SERVER_STATE_SLEEPING` added. `on_sleeping_changed` callback replaced by `set_state_callback` with `server_state_callback_t` type. None are referenced in `jllama.cpp` — no project source changes required | +| b9682–b9739 | `vendor/cpp-httplib/httplib.{h,cpp}` | cpp-httplib bumped from v0.47.0 to v0.48.0. Compiled automatically via FetchContent — no project source changes required | +| b9682–b9739 | `common/speculative.{h,cpp}` | New `common_speculative_get_state()` / `common_speculative_set_state()` Eagle3 state checkpointing APIs; `common_prompt_checkpoint::data_spec` field added for Eagle3 speculative draft state stash. Additive; compiled into upstream `common`; project does not call these functions — no project source changes required. **New feature:** Eagle3 speculative decoding state save/restore — could expose later | +| b9682–b9739 | `common/download.h` + `common/download.cpp` | New `common_download_remove()` function for deleting cached model files. Additive; project does not call it — no project source changes required. **New feature:** could be exposed as `LlamaModel.deleteCachedModel(String path)` | +| b9682–b9739 | `common/arg.cpp` | New `--agent` flag that enables all tools + MCP CORS proxy in one step. Server-level CLI flag; not referenced by `ModelParameters` — no project source changes required. **New feature:** consider `ModelParameters.setAgent(boolean)` | +| b9682–b9739 | `common/arg.cpp` + `tools/server/server-http.cpp` | API key file: lines starting with `#` are now treated as comments and ignored. Behaviour fix for existing `ModelParameters.setApiKeyFile(String)` users — upgrade picks it up automatically, no source changes required | +| b9682–b9739 | `ggml/src/ggml-sycl/` | New conv2d, conv2d_dw, conv2d_transpose, conv3d SYCL ops; Q1_0 quantization support. Internal SYCL backend, no project changes required | +| b9682–b9739 | `ggml/src/ggml-cuda/` | New `col2im_1d` CUDA op. Internal CUDA backend, no project changes required | +| b9682–b9739 | `ggml/src/ggml-metal/` | ROPE_BACK Metal support; concat kernel extended to additional types. Internal Metal backend, no project changes required | diff --git a/src/main/cpp/jllama.cpp b/src/main/cpp/jllama.cpp index ad499e20..37fae057 100644 --- a/src/main/cpp/jllama.cpp +++ b/src/main/cpp/jllama.cpp @@ -14,6 +14,7 @@ #include "server-context.h" #include "server-queue.h" #include "server-task.h" +#include "server-schema.h" #include "server-common.h" #include "server-chat.h" #include "utils.hpp" @@ -200,7 +201,7 @@ static void populate_completion_task(server_task &task, jllama_context *jctx, in if (!tokenized_prompts.empty()) { task.tokens = std::move(tokenized_prompts[0]); } - task.params = server_task::params_from_json_cmpl(jctx->vocab, jctx->params, n_ctx_slot, logit_bias_eog, data); + task.params = server_schema::eval_llama_cmpl_schema(jctx->vocab, jctx->params, n_ctx_slot, logit_bias_eog, data); } [[nodiscard]] static jint dispatch_streaming_completion(JNIEnv *env, jllama_context *jctx, const json &data, diff --git a/src/main/cpp/jni_helpers.hpp b/src/main/cpp/jni_helpers.hpp index 7de6e154..57895d1e 100644 --- a/src/main/cpp/jni_helpers.hpp +++ b/src/main/cpp/jni_helpers.hpp @@ -61,7 +61,7 @@ struct jllama_context { llama_model *vocab_only_model = nullptr; // Saved copy of common_params used to load the model. - // Required by server_task::params_from_json_cmpl which takes common_params&. + // Required by server_schema::eval_llama_cmpl_schema which takes common_params&. common_params params; // Per-streaming-task response readers, keyed by task id. diff --git a/src/test/cpp/test_server.cpp b/src/test/cpp/test_server.cpp index a25ab965..50179efd 100644 --- a/src/test/cpp/test_server.cpp +++ b/src/test/cpp/test_server.cpp @@ -23,6 +23,7 @@ #include "server-context.h" #include "server-queue.h" #include "server-task.h" +#include "server-schema.h" #include "server-common.h" #include "server-chat.h" #include "utils.hpp" @@ -1719,7 +1720,7 @@ namespace { task_params parse_params(const json &data, int n_ctx = 512) { common_params params_base; std::vector no_bias; - return server_task::params_from_json_cmpl(nullptr, params_base, n_ctx, no_bias, data); + return server_schema::eval_llama_cmpl_schema(nullptr, params_base, n_ctx, no_bias, data); } } // namespace