Skip to content

Commit 6a80e23

Browse files
authored
feat(middleware): Model routing, PII filtering, Cloud model proxies (#9802)
Add a routing middleware stack and a cloud-proxy backend.

* cloud-proxy: a Go gRPC backend that forwards OpenAI- and Anthropic-shaped chat requests to upstream providers, with an optional translate mode (OpenAI request -> Anthropic /v1/messages -> OpenAI response) and full tool-calling support.
* routing: admission control, content-aware model routing (embedding cache + classifier + rerank + Arch-Router score), PII detection/redaction (regex + NER) with streaming filter and OpenAI/Anthropic adapters, and a per-user/per-key billing recorder backed by GORM or in-memory storage.
* middleware: UsageMiddleware records usage via the billing recorder, plus admission, route-model, usage-stamp and trace middlewares.
* observability: BackendTrace ring buffer stores full request bodies (capped), MITM proxy emits structured trace events, and router classifier decisions surface at /api/router/decide.
* gallery: Arch-Router-1.5B (Q4_K_M and Q8_0).
* UI: cloud-proxy model-editor fields, classifier system-prompt and score-normalization config, and a Traces page rendering request bodies.

Assisted-by: claude-code:claude-opus-4-7 [Read] [Edit] [Bash]
Signed-off-by: Richard Palethorpe <io@richiejp.com>
1 parent 1dcd1ae commit 6a80e23

229 files changed

Lines changed: 26324 additions & 1015 deletions

File tree

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

.dockerignore

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44
.devcontainer
55
models
66
backends
7+
volumes
78
examples/chatbot-ui/models
89
backend/go/image/stablediffusion-ggml/build/
910
backend/go/*/build
@@ -21,3 +22,11 @@ __pycache__
2122
# backend virtual environments
2223
**/venv
2324
backend/python/**/source
25+
26+
# In-place llama.cpp clone + per-variant build copies. The Makefile
27+
# clones llama.cpp itself at the pinned LLAMA_VERSION; if a stale
28+
# local checkout is COPY'd into the image, the `llama.cpp:` target
29+
# sees the directory and skips re-cloning, so grpc-server.cpp ends
30+
# up compiled against whatever (likely older) commit the host had.
31+
backend/cpp/llama-cpp/llama.cpp
32+
backend/cpp/llama-cpp-*-build

.gitignore

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,10 @@ go-bert
2626
LocalAI
2727
/local-ai
2828
/local-ai-launcher
29+
# Root-level build artifacts when running `go build ./...` against
30+
# Go backend packages whose main lives under backend/go/.
31+
/cloud-proxy
32+
/local-store
2933
# prevent above rules from omitting the helm chart
3034
!charts/*
3135
# prevent above rules from omitting the api/localai folder

Makefile

Lines changed: 12 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -69,7 +69,7 @@ else
6969
GORELEASER=$(shell which goreleaser)
7070
endif
7171

72-
TEST_PATHS?=./api/... ./pkg/... ./core/...
72+
TEST_PATHS?=./api/... ./pkg/... ./core/... ./backend/go/cloud-proxy/... ./backend/go/local-store/...
7373

7474

7575
.PHONY: all test build vendor lint lint-all
@@ -268,12 +268,13 @@ prepare-e2e:
268268
run-e2e-image:
269269
docker run -p 5390:8080 -e MODELS_PATH=/models -e THREADS=1 -e DEBUG=true -d --rm -v $(TEST_DIR):/models --name e2e-tests-$(RANDOM) localai-tests
270270

271-
test-e2e: build-mock-backend prepare-e2e run-e2e-image
271+
test-e2e: build-mock-backend build-cloud-proxy-backend prepare-e2e run-e2e-image
272272
@echo 'Running e2e tests'
273273
BUILD_TYPE=$(BUILD_TYPE) \
274274
LOCALAI_API=http://$(E2E_BRIDGE_IP):5390 \
275275
$(GOCMD) run github.com/onsi/ginkgo/v2/ginkgo --flake-attempts $(TEST_FLAKES) -v -r ./tests/e2e
276276
$(MAKE) clean-mock-backend
277+
$(MAKE) clean-cloud-proxy-backend
277278
$(MAKE) teardown-e2e
278279
docker rmi localai-tests
279280

@@ -1064,6 +1065,7 @@ BACKEND_DS4 = ds4|ds4|.|false|false
10641065
# Golang backends
10651066
BACKEND_PIPER = piper|golang|.|false|true
10661067
BACKEND_LOCAL_STORE = local-store|golang|.|false|true
1068+
BACKEND_CLOUD_PROXY = cloud-proxy|golang|.|false|true
10671069
BACKEND_HUGGINGFACE = huggingface|golang|.|false|true
10681070
BACKEND_SILERO_VAD = silero-vad|golang|.|false|true
10691071
BACKEND_STABLEDIFFUSION_GGML = stablediffusion-ggml|golang|.|--progress=plain|true
@@ -1149,6 +1151,7 @@ $(eval $(call generate-docker-build-target,$(BACKEND_TURBOQUANT)))
11491151
$(eval $(call generate-docker-build-target,$(BACKEND_DS4)))
11501152
$(eval $(call generate-docker-build-target,$(BACKEND_PIPER)))
11511153
$(eval $(call generate-docker-build-target,$(BACKEND_LOCAL_STORE)))
1154+
$(eval $(call generate-docker-build-target,$(BACKEND_CLOUD_PROXY)))
11521155
$(eval $(call generate-docker-build-target,$(BACKEND_HUGGINGFACE)))
11531156
$(eval $(call generate-docker-build-target,$(BACKEND_SILERO_VAD)))
11541157
$(eval $(call generate-docker-build-target,$(BACKEND_STABLEDIFFUSION_GGML)))
@@ -1201,7 +1204,7 @@ $(eval $(call generate-docker-build-target,$(BACKEND_SHERPA_ONNX)))
12011204
docker-save-%: backend-images
12021205
docker save local-ai-backend:$* -o backend-images/$*.tar
12031206

1204-
docker-build-backends: docker-build-llama-cpp docker-build-ik-llama-cpp docker-build-turboquant docker-build-ds4 docker-build-rerankers docker-build-vllm docker-build-vllm-omni docker-build-sglang docker-build-transformers docker-build-outetts docker-build-diffusers docker-build-kokoro docker-build-faster-whisper docker-build-coqui docker-build-chatterbox docker-build-vibevoice docker-build-liquid-audio docker-build-moonshine docker-build-pocket-tts docker-build-qwen-tts docker-build-fish-speech docker-build-faster-qwen3-tts docker-build-qwen-asr docker-build-nemo docker-build-voxcpm docker-build-whisperx docker-build-ace-step docker-build-acestep-cpp docker-build-voxtral docker-build-mlx-distributed docker-build-trl docker-build-llama-cpp-quantization docker-build-tinygrad docker-build-kokoros docker-build-sam3-cpp docker-build-qwen3-tts-cpp docker-build-vibevoice-cpp docker-build-localvqe docker-build-insightface docker-build-speaker-recognition docker-build-sherpa-onnx
1207+
docker-build-backends: docker-build-llama-cpp docker-build-ik-llama-cpp docker-build-turboquant docker-build-ds4 docker-build-rerankers docker-build-vllm docker-build-vllm-omni docker-build-sglang docker-build-transformers docker-build-outetts docker-build-diffusers docker-build-kokoro docker-build-faster-whisper docker-build-coqui docker-build-chatterbox docker-build-vibevoice docker-build-liquid-audio docker-build-moonshine docker-build-pocket-tts docker-build-qwen-tts docker-build-fish-speech docker-build-faster-qwen3-tts docker-build-qwen-asr docker-build-nemo docker-build-voxcpm docker-build-whisperx docker-build-ace-step docker-build-acestep-cpp docker-build-voxtral docker-build-mlx-distributed docker-build-trl docker-build-llama-cpp-quantization docker-build-tinygrad docker-build-kokoros docker-build-sam3-cpp docker-build-qwen3-tts-cpp docker-build-vibevoice-cpp docker-build-localvqe docker-build-insightface docker-build-speaker-recognition docker-build-sherpa-onnx docker-build-cloud-proxy
12051208

12061209
########################################################
12071210
### Mock Backend for E2E Tests
@@ -1213,6 +1216,12 @@ build-mock-backend: protogen-go
12131216
clean-mock-backend:
12141217
rm -f tests/e2e/mock-backend/mock-backend
12151218

1219+
build-cloud-proxy-backend: protogen-go
1220+
$(GOCMD) build -o tests/e2e/mock-backend/cloud-proxy ./backend/go/cloud-proxy
1221+
1222+
clean-cloud-proxy-backend:
1223+
rm -f tests/e2e/mock-backend/cloud-proxy
1224+
12161225
########################################################
12171226
### UI E2E Test Server
12181227
########################################################

backend/backend.proto

Lines changed: 151 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,22 @@ service Backend {
3737

3838
rpc Rerank(RerankRequest) returns (RerankResult) {}
3939

40+
// TokenClassify runs a token-classification (NER) model on the
41+
// supplied text and returns each detected entity span. Used by the
42+
// PII redactor's optional NER tier — the regex tier still handles
43+
// formatted hits cheaply, while this catches names, locations, and
44+
// other unformatted PII that regex misses.
45+
rpc TokenClassify(TokenClassifyRequest) returns (TokenClassifyResponse) {}
46+
47+
// Score evaluates the model's joint log-probability of each
48+
// supplied candidate continuation given a shared prompt. The
49+
// prompt's KV cache is computed once and reused across candidates.
50+
// Used for routing-policy multi-label classification, reranking,
51+
// calibrated confidence, and reward-model scoring — any task where
52+
// the consumer wants the model's confidence in a pre-specified
53+
// continuation rather than a generated one.
54+
rpc Score(ScoreRequest) returns (ScoreResponse) {}
55+
4056
rpc GetMetrics(MetricsRequest) returns (MetricsResponse);
4157

4258
rpc VAD(VADRequest) returns (VADResponse) {}
@@ -68,6 +84,23 @@ service Backend {
6884
rpc QuantizationProgress(QuantizationProgressRequest) returns (stream QuantizationProgressUpdate) {}
6985
rpc StopQuantization(QuantizationStopRequest) returns (Result) {}
7086

87+
// Forward proxies a raw HTTP request to an upstream provider. The
88+
// cloud-proxy backend implements this for passthrough-mode model
89+
// configs: the client wire format is preserved end-to-end (no
90+
// translation through internal proto), which means new provider
91+
// fields work the day they ship. Translation-mode proxies use the
92+
// standard Predict/PredictStream RPCs instead. Backends that don't
93+
// support this return UNIMPLEMENTED.
94+
//
95+
// The RPC is bidirectionally streamed so large bodies can flow
96+
// without buffering. In practice the first ForwardRequest carries
97+
// path, method, headers, and the initial body chunk; subsequent
98+
// messages append body chunks. The first ForwardReply carries the
99+
// upstream status and response headers; subsequent messages stream
100+
// body chunks (SSE frames or chunked transfer). Cancellation of the
101+
// gRPC context closes the upstream connection.
102+
rpc Forward(stream ForwardRequest) returns (stream ForwardReply) {}
103+
71104
}
72105

73106
// Define the empty request
@@ -81,6 +114,76 @@ message MetricsResponse {
81114
int32 prompt_tokens_processed = 5;
82115
}
83116

117+
// TokenClassifyRequest carries the text to classify plus an optional
118+
// score threshold. The transformers backend interprets threshold as
119+
// the minimum confidence for an entity to be included; 0 = include all.
120+
message TokenClassifyRequest {
121+
string text = 1;
122+
float threshold = 2;
123+
}
124+
125+
// TokenClassifyEntity is one detected entity span. Byte offsets are
126+
// into the original UTF-8 text — start..end is a half-open range that
127+
// addresses the substring corresponding to entity_group.
128+
//
129+
// entity_group follows HuggingFace's aggregated-tag convention (e.g.
130+
// "PER", "LOC", "ORG", or a PII-specific label like "EMAIL" /
131+
// "SSN" depending on the model). The redactor's per-pattern action
132+
// map keys off this string.
133+
message TokenClassifyEntity {
134+
string entity_group = 1;
135+
int32 start = 2;
136+
int32 end = 3;
137+
float score = 4;
138+
string text = 5;
139+
}
140+
141+
message TokenClassifyResponse {
142+
repeated TokenClassifyEntity entities = 1;
143+
}
144+
145+
// ScoreRequest carries one shared prompt and one or more continuations
146+
// to score against it. The backend tokenises the prompt once and reuses
147+
// the resulting KV cache across all candidates in this request.
148+
message ScoreRequest {
149+
string prompt = 1;
150+
repeated string candidates = 2;
151+
// Return per-token logprobs for each candidate when true. Default
152+
// false to keep the wire response small; the joint log_prob field
153+
// covers the common ranking case.
154+
bool include_token_logprobs = 3;
155+
// When true, the response also populates length_normalized_log_prob
156+
// (joint log-prob divided by candidate token count). Useful when
157+
// candidates differ in length and the consumer wants a per-token
158+
// measure comparable across them (PMI-style scoring).
159+
bool length_normalize = 4;
160+
}
161+
162+
// CandidateScore is one row in the ScoreResponse, matching by index
163+
// the candidate in ScoreRequest.candidates.
164+
message CandidateScore {
165+
// Sum of log P(token_i | prompt, candidate tokens before i) across the
166+
// candidate's tokens. The primary ranking signal.
167+
double log_prob = 1;
168+
// log_prob / num_tokens — populated when length_normalize=true on
169+
// the request.
170+
double length_normalized_log_prob = 2;
171+
// Per-token detail — populated when include_token_logprobs=true.
172+
repeated TokenLogProb tokens = 3;
173+
// Number of tokens the backend tokenised this candidate into, after
174+
// any backend-specific normalisation (e.g. leading-space handling).
175+
int32 num_tokens = 4;
176+
}
177+
178+
message TokenLogProb {
179+
string token = 1;
180+
double log_prob = 2;
181+
}
182+
183+
message ScoreResponse {
184+
repeated CandidateScore candidates = 1;
185+
}
186+
84187
message RerankRequest {
85188
string query = 1;
86189
repeated string documents = 2;
@@ -325,6 +428,25 @@ message ModelOptions {
325428
// applied verbatim to the backend's engine constructor (e.g. vLLM AsyncEngineArgs).
326429
// Unknown keys produce an error at LoadModel time.
327430
string EngineArgs = 73;
431+
432+
// Proxy carries the cloud-proxy backend's per-model configuration.
433+
// Empty for non-proxy backends.
434+
ProxyOptions Proxy = 74;
435+
}
436+
437+
// ProxyOptions configures the cloud-proxy backend. upstream_url and
438+
// mode are always meaningful; provider only matters in translate mode.
439+
// The two api_key_* fields are mutually exclusive and resolved by the
440+
// backend at LoadModel — core forwards the references rather than the
441+
// plaintext key.
442+
message ProxyOptions {
443+
string upstream_url = 1;
444+
string mode = 2;
445+
string provider = 3;
446+
string api_key_env = 4;
447+
string api_key_file = 5;
448+
string upstream_model = 6;
449+
int32 request_timeout_seconds = 7;
328450
}
329451

330452
message Result {
@@ -1002,3 +1124,32 @@ message QuantizationStopRequest {
10021124
string job_id = 1;
10031125
}
10041126

1127+
// ForwardHeader is one HTTP header on the request or response. Headers
1128+
// like Authorization are typically injected by the backend (from the
1129+
// resolved API key) rather than passed through from the client.
1130+
message ForwardHeader {
1131+
string name = 1;
1132+
string value = 2;
1133+
}
1134+
1135+
// ForwardRequest is a streamed HTTP request to the upstream. First
1136+
// message carries path/method/headers; subsequent messages carry
1137+
// body_chunk only. All fields except body_chunk are honoured on the
1138+
// first message and ignored thereafter.
1139+
message ForwardRequest {
1140+
string path = 1; // e.g. "/v1/chat/completions" — appended to the model's upstream_url
1141+
string method = 2; // usually "POST"
1142+
repeated ForwardHeader headers = 3;
1143+
bytes body_chunk = 4;
1144+
}
1145+
1146+
// ForwardReply is a streamed HTTP response from the upstream. First
1147+
// message carries status/headers; subsequent messages carry body_chunk
1148+
// only. SSE responses arrive as a sequence of body_chunk frames; the
1149+
// caller is responsible for any parsing.
1150+
message ForwardReply {
1151+
int32 status = 1;
1152+
repeated ForwardHeader headers = 2;
1153+
bytes body_chunk = 3;
1154+
}
1155+

0 commit comments

Comments
 (0)