-
-
Notifications
You must be signed in to change notification settings - Fork 47
Expand file tree
/
Copy pathservice.yaml
More file actions
110 lines (110 loc) · 3.08 KB
/
service.yaml
File metadata and controls
110 lines (110 loc) · 3.08 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
# OpenModel: registers the model with llmaz so Services can claim it.
# llmaz downloads it from the model hub into the inference Pods under
# /workspace/models/models--<org>--<name>.
apiVersion: llmaz.io/v1alpha1
kind: OpenModel
metadata:
  name: llama3-405b-instruct
spec:
  familyName: llama3
  source:
    modelHub:
      # Gated Hugging Face repo — Pods must supply HUGGING_FACE_HUB_TOKEN.
      # NOTE(review): metadata.name says "instruct" but this is the base
      # (non-Instruct) 405B checkpoint — confirm which variant is intended.
      modelID: meta-llama/Llama-3.1-405B
---
# Service: runs the claimed model with SGLang across a 2-node
# LeaderWorkerSet group (one GPU per node, tensor parallelism across both).
apiVersion: inference.llmaz.io/v1alpha1
kind: Service
metadata:
  name: llama3-405b-instruct
spec:
  modelClaims:
    models:
      - name: llama3-405b-instruct
  # Number of leader/worker groups (each group = `size` Pods).
  replicas: 2
  workloadTemplate:
    # Pods per group: 1 leader + 1 worker.
    size: 2
    # Restart the whole group if any Pod in it fails — required for
    # distributed inference, where a single lost rank breaks the job.
    restartPolicy: RecreateGroupOnPodRestart
    leaderTemplate:
      metadata:
        labels:
          role: leader
      spec:
        containers:
          - name: model-runner
            # NOTE(review): consider pinning a release tag instead of :latest.
            image: lmsysorg/sglang:latest
            env:
              # Replace with a real token — the model repo is gated.
              - name: HUGGING_FACE_HUB_TOKEN
                value: "<your-hf-token>"
              - name: LWS_WORKER_INDEX
                valueFrom:
                  fieldRef:
                    fieldPath: metadata.labels['leaderworkerset.sigs.k8s.io/worker-index']
            command:
              - python3
              - -m
              - sglang.launch_server
              # Path where llmaz materializes the claimed model
              # (meta-llama/Llama-3.1-405B). The original manifest pointed at
              # an 8B-Instruct path copied from another example.
              - --model-path
              - /workspace/models/models--meta-llama--Llama-3.1-405B
              # Total tensor-parallel size — must equal size * GPUs-per-Pod.
              - --tp
              - "2"
              # Rendezvous address: leader Pod, port 20000.
              - --dist-init-addr
              - $(LWS_LEADER_ADDRESS):20000
              - --nnodes
              - $(LWS_GROUP_SIZE)
              - --node-rank
              - $(LWS_WORKER_INDEX)
              - --trust-remote-code
              - --host
              - "0.0.0.0"
              - --port
              - "40000"
            resources:
              limits:
                nvidia.com/gpu: "1"
            ports:
              # HTTP serving port — leader only; workers join via dist-init.
              - containerPort: 40000
            readinessProbe:
              tcpSocket:
                port: 40000
              initialDelaySeconds: 15
              periodSeconds: 10
            volumeMounts:
              - mountPath: /dev/shm
                name: dshm
        volumes:
          # Memory-backed /dev/shm — NCCL/torch IPC needs more than the
          # default 64Mi shm.
          - name: dshm
            emptyDir:
              medium: Memory
    workerTemplate:
      spec:
        containers:
          - name: model-runner
            image: lmsysorg/sglang:latest
            env:
              - name: HUGGING_FACE_HUB_TOKEN
                value: "<your-hf-token>"
              - name: LWS_WORKER_INDEX
                valueFrom:
                  fieldRef:
                    fieldPath: metadata.labels['leaderworkerset.sigs.k8s.io/worker-index']
            # Same launch as the leader, minus --host/--port: workers only
            # join the distributed group; they do not serve HTTP.
            command:
              - python3
              - -m
              - sglang.launch_server
              - --model-path
              - /workspace/models/models--meta-llama--Llama-3.1-405B
              - --tp
              - "2"
              - --dist-init-addr
              - $(LWS_LEADER_ADDRESS):20000
              - --nnodes
              - $(LWS_GROUP_SIZE)
              - --node-rank
              - $(LWS_WORKER_INDEX)
              - --trust-remote-code
            resources:
              limits:
                nvidia.com/gpu: "1"
            volumeMounts:
              - mountPath: /dev/shm
                name: dshm
        volumes:
          - name: dshm
            emptyDir:
              medium: Memory