Skip to content

Commit af13cf0

Browse files
Merge pull request #240 from bernardladenthin/claude/cool-hypatia-m7kcu3
Add OpenAI-compatible HTTP endpoint for local model serving
2 parents 59abd79 + 6e99a3a commit af13cf0

36 files changed

Lines changed: 2980 additions & 813 deletions

.clang-format

Lines changed: 4 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -175,7 +175,10 @@ RequiresClausePosition: OwnLine
175175
RequiresExpressionIndentation: OuterScope
176176
SeparateDefinitionBlocks: Leave
177177
ShortNamespaceLines: 1
178-
SortIncludes: CaseSensitive
178+
# Never reorder #include lines: this project has order-sensitive includes — the upstream
179+
# server-*.h headers must precede json_helpers.hpp / jni_helpers.hpp (which use the `json`
180+
# alias those headers define). Alphabetical sorting breaks the build (json undefined).
181+
SortIncludes: Never
179182
SortJavaStaticImport: Before
180183
SortUsingDeclarations: LexicographicNumeric
181184
SpaceAfterCStyleCast: false

.github/workflows/clang-format.yml

Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,34 @@
1+
# SPDX-FileCopyrightText: 2026 Bernard Ladenthin <bernard.ladenthin@gmail.com>
2+
#
3+
# SPDX-License-Identifier: MIT
4+
5+
name: clang-format
6+
on:
7+
push:
8+
pull_request:
9+
workflow_dispatch:
10+
11+
# Enforces a single, pinned clang-format across all C++ sources so formatting is
12+
# reproducible between contributors and CI. Bump CLANG_FORMAT_VERSION here and in
13+
# CLAUDE.md (Code Formatting) together, then reformat the tree with the same version.
14+
env:
15+
CLANG_FORMAT_VERSION: "22.1.5"
16+
17+
jobs:
18+
clang-format:
19+
runs-on: ubuntu-latest
20+
steps:
21+
- uses: actions/checkout@v6
22+
- uses: actions/setup-python@v5
23+
with:
24+
python-version: "3.x"
25+
- name: Install pinned clang-format
26+
run: pip install "clang-format==${CLANG_FORMAT_VERSION}"
27+
- name: Check C++ formatting
28+
run: |
29+
clang-format --version
30+
# All hand-written C++ sources; the generated JNI header (src/main/cpp/jllama.h,
31+
# produced by `javac -h`) is intentionally excluded.
32+
files=$(find src/main/cpp src/test/cpp -type f \( -name '*.cpp' -o -name '*.hpp' \) | sort)
33+
echo "Checking:"; echo "$files"
34+
clang-format --dry-run --Werror $files

.github/workflows/publish.yml

Lines changed: 31 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -398,19 +398,19 @@ jobs:
398398
name: Linux-x86_64-libraries
399399
path: ${{ github.workspace }}/src/main/resources/net/ladenthin/llama/
400400
- name: Download text generation model
401-
run: curl -L --fail ${MODEL_URL} --create-dirs -o models/${MODEL_NAME}
401+
run: curl -L --fail --retry 5 --retry-all-errors ${MODEL_URL} --create-dirs -o models/${MODEL_NAME}
402402
- name: Download reranking model
403-
run: curl -L --fail ${RERANKING_MODEL_URL} --create-dirs -o models/${RERANKING_MODEL_NAME}
403+
run: curl -L --fail --retry 5 --retry-all-errors ${RERANKING_MODEL_URL} --create-dirs -o models/${RERANKING_MODEL_NAME}
404404
- name: Download draft model
405-
run: curl -L --fail ${DRAFT_MODEL_URL} --create-dirs -o models/${DRAFT_MODEL_NAME}
405+
run: curl -L --fail --retry 5 --retry-all-errors ${DRAFT_MODEL_URL} --create-dirs -o models/${DRAFT_MODEL_NAME}
406406
- name: Download reasoning model
407-
run: curl -L --fail ${REASONING_MODEL_URL} --create-dirs -o models/${REASONING_MODEL_NAME}
407+
run: curl -L --fail --retry 5 --retry-all-errors ${REASONING_MODEL_URL} --create-dirs -o models/${REASONING_MODEL_NAME}
408408
- name: Download nomic embedding model (issue #98 regression)
409-
run: curl -L --fail ${NOMIC_EMBED_MODEL_URL} --create-dirs -o models/${NOMIC_EMBED_MODEL_NAME}
409+
run: curl -L --fail --retry 5 --retry-all-errors ${NOMIC_EMBED_MODEL_URL} --create-dirs -o models/${NOMIC_EMBED_MODEL_NAME}
410410
- name: Download vision model (issues #103 / #34)
411-
run: curl -L --fail ${VISION_MODEL_URL} --create-dirs -o models/${VISION_MODEL_NAME}
411+
run: curl -L --fail --retry 5 --retry-all-errors ${VISION_MODEL_URL} --create-dirs -o models/${VISION_MODEL_NAME}
412412
- name: Download vision mmproj
413-
run: curl -L --fail ${VISION_MMPROJ_URL} --create-dirs -o models/${VISION_MMPROJ_NAME}
413+
run: curl -L --fail --retry 5 --retry-all-errors ${VISION_MMPROJ_URL} --create-dirs -o models/${VISION_MMPROJ_NAME}
414414
- name: List files in models directory
415415
run: ls -l models/
416416
- name: Validate model files
@@ -519,17 +519,17 @@ jobs:
519519
name: macos-14-libraries
520520
path: ${{ github.workspace }}/src/main/resources/net/ladenthin/llama/
521521
- name: Download text generation model
522-
run: curl -L --fail ${MODEL_URL} --create-dirs -o models/${MODEL_NAME}
522+
run: curl -L --fail --retry 5 --retry-all-errors ${MODEL_URL} --create-dirs -o models/${MODEL_NAME}
523523
- name: Download reranking model
524-
run: curl -L --fail ${RERANKING_MODEL_URL} --create-dirs -o models/${RERANKING_MODEL_NAME}
524+
run: curl -L --fail --retry 5 --retry-all-errors ${RERANKING_MODEL_URL} --create-dirs -o models/${RERANKING_MODEL_NAME}
525525
- name: Download draft model
526-
run: curl -L --fail ${DRAFT_MODEL_URL} --create-dirs -o models/${DRAFT_MODEL_NAME}
526+
run: curl -L --fail --retry 5 --retry-all-errors ${DRAFT_MODEL_URL} --create-dirs -o models/${DRAFT_MODEL_NAME}
527527
- name: Download reasoning model
528-
run: curl -L --fail ${REASONING_MODEL_URL} --create-dirs -o models/${REASONING_MODEL_NAME}
528+
run: curl -L --fail --retry 5 --retry-all-errors ${REASONING_MODEL_URL} --create-dirs -o models/${REASONING_MODEL_NAME}
529529
- name: Download vision model (issues #103 / #34)
530-
run: curl -L --fail ${VISION_MODEL_URL} --create-dirs -o models/${VISION_MODEL_NAME}
530+
run: curl -L --fail --retry 5 --retry-all-errors ${VISION_MODEL_URL} --create-dirs -o models/${VISION_MODEL_NAME}
531531
- name: Download vision mmproj
532-
run: curl -L --fail ${VISION_MMPROJ_URL} --create-dirs -o models/${VISION_MMPROJ_NAME}
532+
run: curl -L --fail --retry 5 --retry-all-errors ${VISION_MMPROJ_URL} --create-dirs -o models/${VISION_MMPROJ_NAME}
533533
- name: List files in models directory
534534
run: ls -l models/
535535
- name: Validate model files
@@ -583,17 +583,17 @@ jobs:
583583
name: macos-15-libraries
584584
path: ${{ github.workspace }}/src/main/resources/net/ladenthin/llama/
585585
- name: Download text generation model
586-
run: curl -L --fail ${MODEL_URL} --create-dirs -o models/${MODEL_NAME}
586+
run: curl -L --fail --retry 5 --retry-all-errors ${MODEL_URL} --create-dirs -o models/${MODEL_NAME}
587587
- name: Download reranking model
588-
run: curl -L --fail ${RERANKING_MODEL_URL} --create-dirs -o models/${RERANKING_MODEL_NAME}
588+
run: curl -L --fail --retry 5 --retry-all-errors ${RERANKING_MODEL_URL} --create-dirs -o models/${RERANKING_MODEL_NAME}
589589
- name: Download draft model
590-
run: curl -L --fail ${DRAFT_MODEL_URL} --create-dirs -o models/${DRAFT_MODEL_NAME}
590+
run: curl -L --fail --retry 5 --retry-all-errors ${DRAFT_MODEL_URL} --create-dirs -o models/${DRAFT_MODEL_NAME}
591591
- name: Download reasoning model
592-
run: curl -L --fail ${REASONING_MODEL_URL} --create-dirs -o models/${REASONING_MODEL_NAME}
592+
run: curl -L --fail --retry 5 --retry-all-errors ${REASONING_MODEL_URL} --create-dirs -o models/${REASONING_MODEL_NAME}
593593
- name: Download vision model (issues #103 / #34)
594-
run: curl -L --fail ${VISION_MODEL_URL} --create-dirs -o models/${VISION_MODEL_NAME}
594+
run: curl -L --fail --retry 5 --retry-all-errors ${VISION_MODEL_URL} --create-dirs -o models/${VISION_MODEL_NAME}
595595
- name: Download vision mmproj
596-
run: curl -L --fail ${VISION_MMPROJ_URL} --create-dirs -o models/${VISION_MMPROJ_NAME}
596+
run: curl -L --fail --retry 5 --retry-all-errors ${VISION_MMPROJ_URL} --create-dirs -o models/${VISION_MMPROJ_NAME}
597597
- name: List files in models directory
598598
run: ls -l models/
599599
- name: Validate model files
@@ -647,17 +647,17 @@ jobs:
647647
name: macos-15-metal-libraries
648648
path: ${{ github.workspace }}/src/main/resources/net/ladenthin/llama/
649649
- name: Download text generation model
650-
run: curl -L --fail ${MODEL_URL} --create-dirs -o models/${MODEL_NAME}
650+
run: curl -L --fail --retry 5 --retry-all-errors ${MODEL_URL} --create-dirs -o models/${MODEL_NAME}
651651
- name: Download reranking model
652-
run: curl -L --fail ${RERANKING_MODEL_URL} --create-dirs -o models/${RERANKING_MODEL_NAME}
652+
run: curl -L --fail --retry 5 --retry-all-errors ${RERANKING_MODEL_URL} --create-dirs -o models/${RERANKING_MODEL_NAME}
653653
- name: Download draft model
654-
run: curl -L --fail ${DRAFT_MODEL_URL} --create-dirs -o models/${DRAFT_MODEL_NAME}
654+
run: curl -L --fail --retry 5 --retry-all-errors ${DRAFT_MODEL_URL} --create-dirs -o models/${DRAFT_MODEL_NAME}
655655
- name: Download reasoning model
656-
run: curl -L --fail ${REASONING_MODEL_URL} --create-dirs -o models/${REASONING_MODEL_NAME}
656+
run: curl -L --fail --retry 5 --retry-all-errors ${REASONING_MODEL_URL} --create-dirs -o models/${REASONING_MODEL_NAME}
657657
- name: Download vision model (issues #103 / #34)
658-
run: curl -L --fail ${VISION_MODEL_URL} --create-dirs -o models/${VISION_MODEL_NAME}
658+
run: curl -L --fail --retry 5 --retry-all-errors ${VISION_MODEL_URL} --create-dirs -o models/${VISION_MODEL_NAME}
659659
- name: Download vision mmproj
660-
run: curl -L --fail ${VISION_MMPROJ_URL} --create-dirs -o models/${VISION_MMPROJ_NAME}
660+
run: curl -L --fail --retry 5 --retry-all-errors ${VISION_MMPROJ_URL} --create-dirs -o models/${VISION_MMPROJ_NAME}
661661
- name: List files in models directory
662662
run: ls -l models/
663663
- name: Validate model files
@@ -714,17 +714,17 @@ jobs:
714714
name: Windows-x86_64-libraries
715715
path: ${{ github.workspace }}/src/main/resources/net/ladenthin/llama/
716716
- name: Download text generation model
717-
run: curl -L --fail $env:MODEL_URL --create-dirs -o models/$env:MODEL_NAME
717+
run: curl -L --fail --retry 5 --retry-all-errors $env:MODEL_URL --create-dirs -o models/$env:MODEL_NAME
718718
- name: Download reranking model
719-
run: curl -L --fail $env:RERANKING_MODEL_URL --create-dirs -o models/$env:RERANKING_MODEL_NAME
719+
run: curl -L --fail --retry 5 --retry-all-errors $env:RERANKING_MODEL_URL --create-dirs -o models/$env:RERANKING_MODEL_NAME
720720
- name: Download draft model
721-
run: curl -L --fail $env:DRAFT_MODEL_URL --create-dirs -o models/$env:DRAFT_MODEL_NAME
721+
run: curl -L --fail --retry 5 --retry-all-errors $env:DRAFT_MODEL_URL --create-dirs -o models/$env:DRAFT_MODEL_NAME
722722
- name: Download reasoning model
723-
run: curl -L --fail $env:REASONING_MODEL_URL --create-dirs -o models/$env:REASONING_MODEL_NAME
723+
run: curl -L --fail --retry 5 --retry-all-errors $env:REASONING_MODEL_URL --create-dirs -o models/$env:REASONING_MODEL_NAME
724724
- name: Download vision model (issues #103 / #34)
725-
run: curl -L --fail $env:VISION_MODEL_URL --create-dirs -o models/$env:VISION_MODEL_NAME
725+
run: curl -L --fail --retry 5 --retry-all-errors $env:VISION_MODEL_URL --create-dirs -o models/$env:VISION_MODEL_NAME
726726
- name: Download vision mmproj
727-
run: curl -L --fail $env:VISION_MMPROJ_URL --create-dirs -o models/$env:VISION_MMPROJ_NAME
727+
run: curl -L --fail --retry 5 --retry-all-errors $env:VISION_MMPROJ_URL --create-dirs -o models/$env:VISION_MMPROJ_NAME
728728
- name: List files in models directory
729729
run: ls -l models/
730730
- name: Validate model files

CLAUDE.md

Lines changed: 25 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -393,10 +393,28 @@ not track the loader's own Java package). This is the same
393393
`spotbugs-exclude.xml`, PIT `targetClasses`, and `CMakeLists.txt` OSInfo repairs.
394394

395395
### Code Formatting
396+
397+
C++ formatting is **enforced in CI** (`.github/workflows/clang-format.yml`) with a **pinned**
398+
clang-format — currently **22.1.5**, installed via `pip install clang-format==22.1.5`. Format with
399+
that exact version before committing; a different clang-format version reflows code differently and
400+
will fail the check.
401+
396402
```bash
397-
clang-format -i src/main/cpp/*.cpp src/main/cpp/*.hpp # Format C++ code
403+
pip install "clang-format==22.1.5"
404+
clang-format -i src/main/cpp/*.cpp src/main/cpp/*.hpp src/test/cpp/*.cpp # Format C++ code
398405
```
399406

407+
The generated JNI header `src/main/cpp/jllama.h` (produced by `javac -h`) is intentionally excluded.
408+
To bump the enforced version, update the pin in **both** the workflow (`CLANG_FORMAT_VERSION`) and
409+
this section (both the version stated in the prose above and the `pip install` command), then reformat the whole tree with the new version in the same commit.
410+
411+
**`.clang-format` sets `SortIncludes: Never` — do not re-enable include sorting.** The project has
412+
order-sensitive includes (see the "Include order rule" above): the upstream `server-*.h` headers and
413+
`utils.hpp` must precede `json_helpers.hpp` / `jni_helpers.hpp`, which use the `json` alias those
414+
headers define. Alphabetical sorting moves the helper headers first and breaks the build with
415+
`'json' does not name a type` (it slips past a local build whose toolchain resolves `json` anyway,
416+
but fails the manylinux/aarch64/Android CI compilers). Keep include order manual.
417+
400418
### Javadoc — must build cleanly before `mvn package`
401419

402420
The release packaging job runs `mvn package` with the `release` profile, which attaches
@@ -453,7 +471,9 @@ If the local check passes (`BUILD SUCCESS`), the `mvn package` job in
453471
- `LlamaIterator` / `LlamaIterable` — Streaming generation via Java `Iterator`/`Iterable`.
454472
- `LlamaLoader` — Extracts the platform-specific native library from the JAR to a temp directory, or finds it on `java.library.path`.
455473
- `OSInfo` — Detects OS and architecture for library resolution.
456-
- `server.LlamaServer` — Optional OpenAI-compatible HTTP server and the fat-jar `Main-Class`. `LlamaServerArgs` parses the CLI; `OaiRouter` / `OaiHttpServer` (NanoHTTPD) map `POST /v1/chat/completions`, `/v1/completions`, `/v1/embeddings` and `GET /v1/models` to the `LlamaModel.handle*` methods. NanoHTTPD is an `<optional>` dependency (bundled only in the fat jar, not inherited by library consumers). The `server` package is a dedicated top layer in the ArchUnit `layeredArchitecture` rule (the only layer allowed to access the root `Api`). See README "OpenAI-compatible HTTP server".
474+
- **`server` package — OpenAI-compatible HTTP endpoint. NOTE: two implementations coexist on this branch pending a "best of both" consolidation (see [`TODO.md`](TODO.md)).**
475+
- `server.OpenAiCompatServer` — built on the JDK's `com.sun.net.httpserver` (no new dependency). Serves `POST /v1/chat/completions` (streaming via SSE + non-streaming) and `GET /v1/models` by delegating to `LlamaModel.chatComplete` / `LlamaModel.streamChatCompletion`, so editors that speak the OpenAI protocol (e.g. VS Code Copilot "Custom Endpoint") can drive a local model. Streaming uses the native OAI chunk path (`requestChatCompletionStream` / `receiveChatCompletionChunk`), preserving `delta.tool_calls`.
476+
- `server.LlamaServer` — an OpenAI-compatible HTTP server and the fat-jar `Main-Class`. `LlamaServerArgs` parses the CLI; `OaiRouter` / `OaiHttpServer` (NanoHTTPD) map `POST /v1/chat/completions`, `/v1/completions`, `/v1/embeddings` and `GET /v1/models` to the `LlamaModel.handle*` methods. NanoHTTPD is an `<optional>` dependency (bundled only in the fat jar, not inherited by library consumers). The `server` package is a dedicated top layer in the ArchUnit `layeredArchitecture` rule (the only layer allowed to access the root `Api`). See README "OpenAI-compatible HTTP server".
457477

458478
**Native layer** (`src/main/cpp/`):
459479
- `jllama.cpp` — JNI implementation bridging Java calls to llama.cpp. ~1,215 lines; 17 native methods.
@@ -478,7 +498,7 @@ The project C++ helpers follow a strict semantic split:
478498

479499
Functions: `get_result_error_message`, `results_to_json`, `rerank_results_to_json`,
480500
`parse_encoding_format`, `extract_embedding_prompt`, `is_infill_request`,
481-
`parse_slot_prompt_similarity`, `parse_positive_int_config`.
501+
`parse_slot_prompt_similarity`, `parse_positive_int_config`, `wrap_stream_chunk`.
482502

483503
**`log_helpers.hpp`** — Pure log-formatting transforms.
484504
- Input: `ggml_log_level`, message text (`const char*`), an explicit `std::time_t` timestamp.
@@ -584,11 +604,11 @@ ctest --test-dir build --output-on-failure -R "ResultsToJson"
584604
|------|-------|-------|
585605
| `src/test/cpp/test_utils.cpp` | 156 | Upstream helpers: `server_tokens`, `server_grammar_trigger`, `gen_tool_call_id`, `json_value`, `json_get_nested_values`, UTF-8 helpers, `format_response_rerank`, `format_embeddings_response_oaicompat`, `oaicompat_completion_params_parse`, `oaicompat_chat_params_parse`, `are_lora_equal`, `strip_flag_from_argv`, `token_piece_value`, `json_is_array_and_contains_numbers`, `format_oai_sse`, `format_oai_resp_sse`, `format_anthropic_sse` |
586606
| `src/test/cpp/test_server.cpp` | 188 | Upstream result types: `result_timings`, `task_params::to_json()` (incl. `dry_sequence_breakers`, `preserved_tokens`, `timings_per_token`), `completion_token_output`, `server_task_result_cmpl_partial` (non-oaicompat + `to_json_oaicompat` + logprobs + `to_json_oaicompat_chat` + `to_json_anthropic` + dispatcher), `server_task_result_cmpl_final` (non-oaicompat + `to_json_oaicompat` + `to_json_oaicompat_chat` + `to_json_oaicompat_chat_stream` + `to_json_anthropic` + `to_json_anthropic_stream` + tool_calls + dispatcher), `server_task_result_embd`, `server_task_result_rerank`, `server_task_result_metrics`, `server_task_result_slot_save_load`, `server_task_result_slot_erase`, `server_task_result_apply_lora`, `server_task_result_error`, `format_error_response`, `server_task::need_sampling()`, `server_task::n_tokens()`, `server_task::params_from_json_cmpl()` (parsing pipeline + grammar routing + error paths), `response_fields` projection |
587-
| `src/test/cpp/test_json_helpers.cpp` | 42 | All functions in `json_helpers.hpp`: `get_result_error_message`, `results_to_json`, `rerank_results_to_json`, `parse_encoding_format`, `extract_embedding_prompt`, `is_infill_request`, `parse_slot_prompt_similarity`, `parse_positive_int_config` |
607+
| `src/test/cpp/test_json_helpers.cpp` | 47 | All functions in `json_helpers.hpp`: `get_result_error_message`, `results_to_json`, `rerank_results_to_json`, `parse_encoding_format`, `extract_embedding_prompt`, `is_infill_request`, `parse_slot_prompt_similarity`, `parse_positive_int_config`, `wrap_stream_chunk` |
588608
| `src/test/cpp/test_log_helpers.cpp` | 13 | All functions in `log_helpers.hpp`: `log_level_name`, `format_log_as_json` |
589609
| `src/test/cpp/test_jni_helpers.cpp` | 41 | All functions in `jni_helpers.hpp` using a zero-filled `JNINativeInterface_` mock |
590610

591-
**Current total: 440 tests (all passing).**
611+
**Current total: 445 tests (all passing).**
592612

593613
#### Upstream source location (in CMake build tree)
594614

CMakeLists.txt

Lines changed: 10 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -355,5 +355,14 @@ if(BUILD_TESTING)
355355
SERVER_VERBOSE=$<BOOL:${LLAMA_VERBOSE}>
356356
)
357357

358-
gtest_discover_tests(jllama_test)
358+
# gtest_discover_tests runs the freshly built jllama_test executable at build
359+
# time (POST_BUILD) to enumerate test cases. The default discovery timeout is
360+
# 5s. The 32-bit Windows (Win32) build links the entire llama/ggml/server tree
361+
# statically into one large binary whose startup + test enumeration sits right
362+
# at that 5s boundary on shared CI runners: the same b9682 binary discovered
363+
# within 5s in one run but was killed at the 5s timeout in another (empty
364+
# output, process still alive — a timeout, not a crash). x64/Linux/macOS finish
365+
# well under the default. Raise the budget so 32-bit discovery is not flaky;
366+
# this is a maximum, so fast platforms still return immediately.
367+
gtest_discover_tests(jllama_test DISCOVERY_TIMEOUT 120)
359368
endif()

0 commit comments

Comments
 (0)