-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathray-llama-cpp.service
More file actions
74 lines (70 loc) · 2.19 KB
/
Copy pathray-llama-cpp.service
File metadata and controls
74 lines (70 loc) · 2.19 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
[Unit]
Description=llama.cpp Server for Ray
# Ordering only: start after network.target when both are in the same
# transaction. This does not pull network.target in as a requirement.
After=network.target
# Start rate limit: at most 10 start attempts within any 60-second window.
# Once exceeded, further (re)starts are refused and the unit enters a failed
# state until the limit is reset.
StartLimitIntervalSec=60
StartLimitBurst=10
[Service]
# Long-running foreground process, run as the unprivileged "ray" account.
Type=simple
User=ray

# All server options are passed on the command line and ONLY there.
# llama-server gives command-line arguments precedence over LLAMA_ARG_*
# environment variables, so setting both creates a second copy of the
# configuration that has no effect and can silently drift. To change an
# option, edit this ExecStart (or replace it in a drop-in).
ExecStart=/usr/local/bin/llama-server \
    --model /var/lib/ray/models/qwen2.5-0.5b-instruct-q4_k_m.gguf \
    --alias qwen2.5-0.5b-instruct-q4_k_m \
    --host 127.0.0.1 \
    --port 8081 \
    --ctx-size 3072 \
    --parallel 2 \
    --threads 2 \
    --threads-http 2 \
    --batch-size 256 \
    --ubatch-size 128 \
    --cache-prompt \
    --cache-reuse 256 \
    --cache-ram 512 \
    --cont-batching \
    --metrics \
    --slots \
    --warmup \
    --kv-unified \
    --cache-idle-slots \
    --context-shift

# Restart on any exit, 2 seconds after the process ends. The start rate
# limit configured in [Unit] bounds crash loops.
Restart=always
RestartSec=2

# Journal rate limit: at most 200 messages per 30 s; excess is dropped.
LogRateLimitIntervalSec=30s
LogRateLimitBurst=200

# Shutdown: SIGTERM to the main process, then after 35 s SIGKILL to whatever
# is left in the control group (KillMode=mixed).
TimeoutStopSec=35
KillSignal=SIGTERM
KillMode=mixed

# If the kernel OOM killer hits any process of this unit, stop the whole
# service. The positive score adjustment makes this unit a more likely
# OOM victim than processes left at the default score.
OOMPolicy=stop
OOMScoreAdjust=250

# Resource control (cgroup). CPUWeight is below the default of 100.
TasksMax=256
CPUAccounting=true
CPUWeight=80
MemoryAccounting=true
IOAccounting=true
# MemoryHigh is the throttling threshold, MemoryMax the hard limit.
MemoryHigh=2142M
MemoryMax=2380M
MemorySwapMax=595M

# Privileges: no privilege escalation and an empty capability bounding set
# (the empty assignment drops all capabilities).
NoNewPrivileges=true
CapabilityBoundingSet=
SystemCallArchitectures=native

# File system and device sandboxing. PrivateDevices hides physical devices,
# so this configuration cannot use a GPU exposed through /dev.
PrivateTmp=true
PrivateDevices=true
ProtectSystem=full
ProtectHome=true

# Kernel and host state protection.
ProtectClock=true
ProtectHostname=true
ProtectControlGroups=true
ProtectKernelModules=true
ProtectKernelTunables=true
LockPersonality=true
# Forbids memory mappings that are both writable and executable.
MemoryDenyWriteExecute=true

# Only UNIX-domain and IPv4/IPv6 sockets may be created.
RestrictAddressFamilies=AF_UNIX AF_INET AF_INET6
RestrictNamespaces=true
RestrictRealtime=true
RestrictSUIDSGID=true

# Files created by the service are accessible to its owner only.
UMask=077
LimitNOFILE=4096
[Install]
# "systemctl enable" links this unit into multi-user.target, so it is started
# whenever that target is reached during boot.
WantedBy=multi-user.target