llama-swap/config.example.yaml at main · codeanker/llama-swap · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
# add this modeline for validation in vscode
# yaml-language-server: $schema=https://raw.githubusercontent.com/mostlygeek/llama-swap/refs/heads/main/config-schema.json
#
# llama-swap YAML configuration example
# -------------------------------------
#
# 💡 Tip - Use an LLM with this file!
# ====================================
#  This example configuration is written to be LLM friendly. Try
#  copying this file into an LLM and asking it to explain or generate
#  sections for you.
# ====================================

# Usage notes:
# - Below are all the available configuration options for llama-swap.
# - Settings noted as "required" must be in your configuration file
# - Settings noted as "optional" can be omitted

# healthCheckTimeout: number of seconds to wait for a model to be ready to serve requests
# - optional, default: 120
# - minimum value is 15 seconds, anything less will be set to this value
healthCheckTimeout: 500

# logLevel: sets the logging value
# - optional, default: info
# - Valid log levels: debug, info, warn, error
logLevel: info

# logTimeFormat: enables and sets the logging timestamp format
# - optional, default (disabled): ""
# - Valid values: "", "ansic", "unixdate", "rubydate", "rfc822", "rfc822z",
#   "rfc850", "rfc1123", "rfc1123z", "rfc3339", "rfc3339nano", "kitchen",
#   "stamp", "stampmilli", "stampmicro", and "stampnano".
# - For more info, read: https://pkg.go.dev/time#pkg-constants
logTimeFormat: ""

# logToStdout: controls what is logged to stdout
# - optional, default: "proxy"
# - valid values:
#   - "proxy": logs generated by llama-swap when swapping models,
#      handling requests, etc.
#   - "upstream": a copy of an upstream processes stdout logs
#   - "both": both the proxy and upstream logs interleaved together
#   - "none": no logs are ever written to stdout
logToStdout: "proxy"

# metricsMaxInMemory: maximum number of metrics to keep in memory
# - optional, default: 1000
# - controls how many metrics are stored in memory before older ones are discarded
# - useful for limiting memory usage when processing large volumes of metrics
metricsMaxInMemory: 1000

# captureBuffer: how many MBs to allocate for storing request/response captures
# - optional, default: 10
# - set to 0 to disable
captureBuffer: 15

# startPort: sets the starting port number for the automatic ${PORT} macro.
# - optional, default: 5800
# - the ${PORT} macro can be used in model.cmd and model.proxy settings
# - it is automatically incremented for every model that uses it
startPort: 10001

# sendLoadingState: inject loading status updates into the reasoning (thinking)
# field
# - optional, default: false
# - when true, a stream of loading messages will be sent to the client in the
#   reasoning field so chat UIs can show that loading is in progress.
# - see #366 for more details
sendLoadingState: true

# includeAliasesInList: present aliases within the /v1/models OpenAI API listing
# - optional, default: false
# - when true, model aliases will be output to the API model listing duplicating
#   all fields except for Id so chat UIs can use the alias equivalent to the original.
includeAliasesInList: false

# globalTTL: the default TTL in seconds before unloading a model
# - optional, default: 0 (never automatically unload)
# - must be >= 0
globalTTL: 0

# macros: a dictionary of string substitutions
# - optional, default: empty dictionary
# - macros are reusable snippets
# - used in a model's cmd, cmdStop, proxy, checkEndpoint, filters.stripParams
# - useful for reducing common configuration settings
# - macro names are strings and must be less than 64 characters
# - macro names must match the regex ^[a-zA-Z0-9_-]+$
# - macro names must not be a reserved name: PORT or MODEL_ID
# - macro values can be numbers, bools, or strings
# - macros can contain other macros, but they must be defined before they are used
# - environment variables can be referenced with ${env.VAR_NAME} syntax
#   - env macros are substituted first, before regular macros
#   - if the env var is not set, config loading will fail with an error
macros:
  # Example of a multi-line macro
  "latest-llama": >
    /path/to/llama-server/llama-server-ec9e0301
    --port ${PORT}

  "default_ctx": 4096

  # Example of macro-in-macro usage. macros can contain other macros
  # but they must be previously declared.
  "default_args": "--ctx-size ${default_ctx}"

  # Example of environment variable macros
  # - ${env.VAR_NAME} pulls the value from the system environment
  # - useful for paths, secrets, or machine-specific configuration
  "models_dir": "${env.HOME}/models"

# apiKeys: require an API key when making requests to inference endpoints
# - optional, default: []
# - when empty (the default) authorization will not be checked as llama-swap is default-allow
# - each key is a non-empty string
apiKeys:
  - "sk-hunter2"
  # tip, one liner: printf "sk-%s\n" "$(head -c 48 /dev/urandom | base64 )"
  - "sk-gyCPiKUcIfPlaM4OSMZekkprgijPx6+OsmQs8Rsg0xZ9qpy6gKWsIKqHOk+cgXVx"

  # use environment variable macros to keep secrets out of the config
  - "${env.API_KEY_1}"
  - "${env.API_KEY_2}"

# models: a dictionary of model configurations
# - required
# - each key is the model's ID, used in API requests
# - model settings have default values that are used if they are not defined here
# - the model's ID is available in the ${MODEL_ID} macro, also available in macros defined above
# - below are examples of the all the settings a model can have
models:
  # keys are the model names used in API requests
  "gpt-oss-120b":
    # macros: a dictionary of string substitutions specific to this model
    # - optional, default: empty dictionary
    # - macros defined here override macros defined in the global macros section
    # - model level macros follow the same rules as global macros
    macros:
      "default_ctx": 16384
      "temp": 0.7

    # cmd: the command to run to start the inference server.
    # - required
    # - it is just a string, similar to what you would run on the CLI
    # - using `|` allows for comments in the command, these will be parsed out
    # - macros can be used within cmd
    cmd: |
      # ${latest-llama} is a macro that is defined above
      ${latest-llama}
      --model path/to/gpt-oss-120B.gguf
      --ctx-size ${default_ctx}
      --temperature ${temp}

    # name: a display name for the model
    # - optional, default: empty string
    # - if set, it will be used in the v1/models API response
    # - if not set, it will be omitted in the JSON model record
    name: "gpt-oss 120B"

    # description: a description for the model
    # - optional, default: empty string
    # - if set, it will be used in the v1/models API response
    # - if not set, it will be omitted in the JSON model record
    description: "A thinking model from OpenAI"

    # env: define an array of environment variables to inject into cmd's environment
    # - optional, default: empty array
    # - each value is a single string
    # - in the format: ENV_NAME=value
    env:
      - "CUDA_VISIBLE_DEVICES=0,1,2"

    # proxy: the URL where llama-swap routes API requests
    # - optional, default: http://localhost:${PORT}
    # - if you used ${PORT} in cmd this can be omitted
    # - if you use a custom port in cmd this *must* be set
    proxy: http://127.0.0.1:8999

    # checkEndpoint: URL path to check if the server is ready
    # - optional, default: /health
    # - endpoint is expected to return an HTTP 200 response
    # - all requests wait until the endpoint is ready or fails
    # - use "none" to skip endpoint health checking
    checkEndpoint: /custom-endpoint

    # ttl: automatically unload the model after ttl seconds
    # - optional, default: -1 (use global default)
    # - ttl values must be a value greater than or equal to 0
    # - a ttl of -1 will use the global TTL value as the default
    # - a ttl of 0 will mean never unload
    # - a value of 0 disables automatic unloading of the model
    ttl: 60

    # useModelName: override the model name that is sent to upstream server
    # - optional, default: ""
    # - useful for when the upstream server expects a specific model name that
    #   is different from the model's ID
    useModelName: "openai/gpt-oss-120B"

    # filters: a dictionary of filter settings
    # - optional, default: empty dictionary
    # - same capabilities as peer filters (stripParams, setParams)
    filters:
      # stripParams: a comma separated list of parameters to remove from the request
      # - optional, default: ""
      # - useful for server side enforcement of sampling parameters
      # - the `model` parameter can never be removed
      # - can be any JSON key in the request body
      # - recommended to stick to sampling parameters
      stripParams: "temperature, top_p, top_k"

      # setParams: a dictionary of parameters to set/override in requests
      # - optional, default: empty dictionary
      # - useful for enforcing specific parameter values
      # - protected params like "model" cannot be overridden
      # - values can be strings, numbers, booleans, arrays, or objects
      # - always runs for the model
      setParams:
        # Example: enforce specific sampling parameters
        temperature: 0.7
        top_p: 0.9

      # setParamsByID: a dictionary of parameters to set based the model ID
      # - optional, default: empty dictionary
      # - combine with aliases to create variant behaviour without reloading the model
      # - parameters are set in the request body JSON
      # - run after setParams so it will override any settings
      # - protected params like "model" cannot be overridden
      # - values can be strings, numbers, booleans, arrays, or objects
      # - model aliases will be automatically created for each key
      setParamsByID:
        "${MODEL_ID}":
          chat_template_kwargs:
            reasoning_effort: medium
        "${MODEL_ID}:high":
          chat_template_kwargs:
            reasoning_effort: high
        "${MODEL_ID}:low":
          chat_template_kwargs:
            reasoning_effort: low

    # aliases: alternative model names that this model configuration is used for
    # - optional, default: empty array
    # - aliases must be unique globally
    # - useful for impersonating a specific model
    aliases:
      - "gpt-4o-mini"

    # metadata: a dictionary of arbitrary values that are included in /v1/models
    # - optional, default: empty dictionary
    # - while metadata can contains complex types it is recommended to keep it simple
    # - metadata is only passed through in /v1/models responses
    metadata:
      # port will remain an integer
      port: ${PORT}

      # the ${temp} macro will remain a float
      temperature: ${temp}
      note: "The ${MODEL_ID} is running on port ${PORT} temp=${temp}, context=${default_ctx}"

      a_list:
        - 1
        - 1.23
        - "macros are OK in list and dictionary types: ${MODEL_ID}"

      an_obj:
        a: "1"
        b: 2
        # objects can contain complex types with macro substitution
        # becomes: c: [0.7, false, "model: llama"]
        c: ["${temp}", false, "model: ${MODEL_ID}"]

    # concurrencyLimit: overrides the allowed number of active parallel requests to a model
    # - optional, default: 0
    # - useful for limiting the number of active parallel requests a model can process
    # - must be set per model
    # - any number greater than 0 will override the internal default value of 10
    # - any requests that exceeds the limit will receive an HTTP 429 Too Many Requests response
    # - recommended to be omitted and the default used
    concurrencyLimit: 0

    # sendLoadingState: overrides the global sendLoadingState setting for this model
    # - optional, default: undefined (use global setting)
    sendLoadingState: false

  # Unlisted model example:
  "qwen-unlisted":
    # unlisted: boolean, true or false
    # - optional, default: false
    # - unlisted models do not show up in /v1/models api requests
    # - can be requested as normal through all apis
    unlisted: true
    cmd: llama-server --port ${PORT} -m Llama-3.2-1B-Instruct-Q4_K_M.gguf -ngl 0

  # Docker example:
  # container runtimes like Docker and Podman can be used reliably with
  # a combination of cmd, cmdStop, and ${MODEL_ID}
  "docker-llama":
    proxy: "http://127.0.0.1:${PORT}"
    cmd: |
      docker run --name ${MODEL_ID}
      --init --rm -p ${PORT}:8080 -v /mnt/nvme/models:/models
      ghcr.io/ggml-org/llama.cpp:server
      --model '/models/Qwen2.5-Coder-0.5B-Instruct-Q4_K_M.gguf'

    # cmdStop: command to run to stop the model gracefully
    # - optional, default: ""
    # - useful for stopping commands managed by another system
    # - the upstream's process id is available in the ${PID} macro
    #
    # When empty, llama-swap has this default behaviour:
    # - on POSIX systems: a SIGTERM signal is sent
    # - on Windows, calls taskkill to stop the process
    # - processes have 5 seconds to shutdown until forceful termination is attempted
    cmdStop: docker stop ${MODEL_ID}

# groups: a dictionary of group settings
# - optional, default: empty dictionary
# - provides advanced controls over model swapping behaviour
# - using groups some models can be kept loaded indefinitely, while others are swapped out
# - model IDs must be defined in the Models section
# - a model can only be a member of one group
# - group behaviour is controlled via the `swap`, `exclusive` and `persistent` fields
# - see issue #109 for details
#
# NOTE: the example below uses model names that are not defined above for demonstration purposes
groups:
  # group1 works the same as the default behaviour of llama-swap where only one model is allowed
  # to run a time across the whole llama-swap instance
  "group1":
    # swap: controls the model swapping behaviour in within the group
    # - optional, default: true
    # - true : only one model is allowed to run at a time
    # - false: all models can run together, no swapping
    swap: true

    # exclusive: controls how the group affects other groups
    # - optional, default: true
    # - true: causes all other groups to unload when this group runs a model
    # - false: does not affect other groups
    exclusive: true

    # members references the models defined above
    # required
    members:
      - "llama"
      - "qwen-unlisted"

  # Example:
  # - in group2 all models can run at the same time
  # - when a different group is loaded it causes all running models in this group to unload
  "group2":
    swap: false

    # exclusive: false does not unload other groups when a model in group2 is requested
    # - the models in group2 will be loaded but will not unload any other groups
    exclusive: false
    members:
      - "docker-llama"
      - "modelA"
      - "modelB"

  # Example:
  # - a persistent group, prevents other groups from unloading it
  "forever":
    # persistent: prevents over groups from unloading the models in this group
    # - optional, default: false
    # - does not affect individual model behaviour
    persistent: true

    # set swap/exclusive to false to prevent swapping inside the group
    # and the unloading of other groups
    swap: false
    exclusive: false
    members:
      - "forever-modelA"
      - "forever-modelB"
      - "forever-modelc"

# hooks: a dictionary of event triggers and actions
# - optional, default: empty dictionary
# - the only supported hook is on_startup
hooks:
  # on_startup: a dictionary of actions to perform on startup
  # - optional, default: empty dictionary
  # - the only supported action is preload
  on_startup:
    # preload: a list of model ids to load on startup
    # - optional, default: empty list
    # - model names must match keys in the models sections
    # - when preloading multiple models at once, define a group
    #   otherwise models will be loaded and swapped out
    preload:
      - "llama"

# peers: a dictionary of remote peers and models they provide
# - optional, default empty dictionary
# - peers can be another llama-swap
# - peers can be any server that provides the /v1/ generative api endpoints supported by llama-swap
peers:
  # keys is the peer'd ID
  llama-swap-peer:
    # proxy: a valid base URL to proxy requests to
    # - required
    # - requested path to llama-swap will be appended to the end of the proxy value
    proxy: http://192.168.1.23
    # models: a list of models served by the peer
    # - required
    models:
      - model_a
      - model_b
      - embeddings/model_c
  openrouter:
    proxy: https://openrouter.ai/api
    # apiKey: a string key to be injected into the request
    # - optional, default: ""
    # - if blank, no key will be added to the request
    # - key will be injected into headers: Authorization: Bearer <key> and x-api-key: <key>
    # - can be a string or a macro
    apiKey: ${env.OPENROUTER_API_KEY}
    models:
      - meta-llama/llama-3.1-8b-instruct
      - qwen/qwen3-235b-a22b-2507
      - deepseek/deepseek-v3.2
      - z-ai/glm-4.7
      - moonshotai/kimi-k2-0905
      - minimax/minimax-m2.1
    # filters: a dictionary of filter settings for peer requests
    # - optional, default: empty dictionary
    # - same capabilities as model filters (stripParams, setParams)
    filters:
      # stripParams: a comma separated list of parameters to remove from the request
      # - optional, default: ""
      # - useful for removing parameters that the peer doesn't support
      # - the `model` parameter can never be removed
      stripParams: "temperature, top_p"

      # setParams: a dictionary of parameters to set/override in requests to this peer
      # - optional, default: empty dictionary
      # - useful for injecting provider-specific settings like data retention policies
      # - protected params like "model" cannot be overridden
      # - values can be strings, numbers, booleans, arrays, or objects
      setParams:
        # Example: enforce zero-data-retention for OpenRouter
        provider:
          data_collection: "deny"
          zdr: true