Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions config/models/SUPPORTED_MODELS.md
Original file line number Diff line number Diff line change
Expand Up @@ -139,6 +139,7 @@ This document provides a comprehensive reference of all models supported by SGLa
|-------------------------|----------------------------------------|-------------------------|---------------------------------|--------|---------|----------------|------------|
| GPT-OSS 20B | `openai/gpt-oss-20b` | 20B | GPTOSSForCausalLM | 40 GB | - | No | Configured |
| GPT-OSS 120B | `openai/gpt-oss-120b` | 120B | GPTOSSForCausalLM | 240 GB | - | No | Configured |
| GPT-OSS 120B Eagle3 (draft) | `nvidia/gpt-oss-120b-eagle3-long-context` | Draft for spec decoding | GptOssForCausalLM | - | - | No | Configured |
| ChatGLM2 6B | `THUDM/chatglm2-6b` | 6B | ChatGLMForConditionalGeneration | 12 GB | 32K | No | Configured |
| GLM-4 9B Chat | `ZhipuAI/glm-4-9b-chat` | 9B | ChatGLMForConditionalGeneration | 18 GB | 1M | No | Configured |
| InternLM2 7B | `internlm/internlm2-7b` | 7B | InternLM2ForCausalLM | 14 GB | 32K | No | Configured |
Expand Down
1 change: 1 addition & 0 deletions config/models/kustomization.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -124,6 +124,7 @@ resources:
- nvidia/Llama-3.1-Nemotron-Nano-8B-v1.yaml
- nvidia/NVIDIA-Nemotron-Nano-12B-v2-VL-BF16.yaml
- nvidia/NVIDIA-Nemotron-Nano-9B-v2.yaml
- nvidia/gpt-oss-120b-eagle3-long-context.yaml

# openai
- openai/clip-vit-large-patch14-336.yaml
Expand Down
22 changes: 22 additions & 0 deletions config/models/nvidia/gpt-oss-120b-eagle3-long-context.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
# ClusterBaseModel manifest that registers the checkpoint stored at
# hf://nvidia/gpt-oss-120b-eagle3-long-context under the resource name
# gpt-oss-120b-eagle3-long-context.
# NOTE(review): leading indentation is not visible in this view, so the
# nesting of the keys below cannot be read off the text — confirm against
# the checked-in file before copying.
apiVersion: ome.io/v1beta1
kind: ClusterBaseModel
metadata:
# Resource name; presumably the value an InferenceService uses to reference
# this model — confirm against the consumer.
name: gpt-oss-120b-eagle3-long-context
spec:
# Single declared capability: text in, text out.
modelCapabilities:
- TEXT_TO_TEXT
vendor: nvidia
# The model entry is enabled.
disabled: false
# Quoted so the version stays a string rather than being parsed as a number.
version: "1.0.0"
# Display name is "<vendor>.<resource name>".
displayName: nvidia.gpt-oss-120b-eagle3-long-context
# Framework / format / architecture triple describing the checkpoint.
# NOTE(review): presumably matched against a serving runtime's supported
# formats when a runtime is selected — confirm.
modelFramework:
name: transformers
version: "4.55.0"
modelFormat:
name: safetensors
version: "1.0.0"
modelArchitecture: GptOssForCausalLM
storage:
# Source location of the weights (hf:// scheme, presumably Hugging Face Hub).
storageUri: hf://nvidia/gpt-oss-120b-eagle3-long-context
# Filesystem path for the model files, mirroring the vendor/name of the URI.
path: /raid/models/nvidia/gpt-oss-120b-eagle3-long-context
# NOTE(review): presumably names the credential used for the hf:// download —
# confirm what "hf-token" must resolve to in the target cluster.
key: "hf-token"
1 change: 1 addition & 0 deletions config/runtimes/kustomization.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ resources:
- srt/e5-mistral-7b-instruct-rt.yaml
- srt/gpt-oss-120b-rt.yaml
- srt/gpt-oss-20b-rt.yaml
- srt/openai/gpt-oss-120b-eagle3-rt.yaml
- srt/kimi-k2-pd-rt.yaml
- srt/meta/llama-3-1-70b-instruct-pd-rt.yaml
- srt/meta/llama-3-1-70b-instruct-rt.yaml
Expand Down
117 changes: 117 additions & 0 deletions config/runtimes/srt/openai/gpt-oss-120b-eagle3-rt.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,117 @@
# ClusterServingRuntime manifest named srt-gpt-oss-120b-eagle3. It runs
# sglang.launch_server with the EAGLE3 speculative algorithm, taking the
# target model from $MODEL_PATH and the draft model from $DRAFT_MODEL_PATH.
# NOTE(review): leading indentation is not visible in this view, so the
# nesting of the keys below cannot be read off the text — confirm against
# the checked-in file before copying.
apiVersion: ome.io/v1beta1
kind: ClusterServingRuntime
metadata:
name: srt-gpt-oss-120b-eagle3
spec:
disabled: false
# Target-model format this runtime declares support for:
# transformers 4.55.0 / safetensors 1.0.0 / GptOssForCausalLM.
supportedModelFormats:
- modelFramework:
name: transformers
version: "4.55.0"
modelFormat:
name: safetensors
version: "1.0.0"
modelArchitecture: GptOssForCausalLM
# autoSelect is off and priority is 1.
# NOTE(review): presumably this means the runtime must be named explicitly
# by the InferenceService — confirm.
autoSelect: false
priority: 1
# Size window for the target model: 115B to 125B.
modelSizeRange:
min: 115B
max: 125B
# Size window for the draft model: 0.1B to 1B.
draftModelSizeRange:
min: 0.1B
max: 1B
# Draft models are only constrained by format here (safetensors 1.0.0);
# no framework or architecture is listed for them.
supportedDraftModelFormats:
- modelFormat:
name: safetensors
version: "1.0.0"
protocolVersions:
- openAI
engineConfig:
# Prometheus scrape hints; port 8080 is the same port the server listens on
# (--port=8080) and the container exposes below.
annotations:
prometheus.io/scrape: "true"
prometheus.io/port: "8080"
prometheus.io/path: "/metrics"
labels:
logging-forward: enabled
# Tolerates any nvidia.com/gpu NoSchedule taint, whatever its value.
tolerations:
- key: "nvidia.com/gpu"
operator: "Exists"
effect: "NoSchedule"
# Memory-backed emptyDir; mounted at /dev/shm by the runner's volumeMounts.
volumes:
- name: dshm
emptyDir:
medium: Memory
# Hard scheduling requirement: only nodes whose instance-type label is
# BM.GPU.H100.8 are eligible.
affinity:
nodeAffinity:
requiredDuringSchedulingIgnoredDuringExecution:
nodeSelectorTerms:
- matchExpressions:
- key: node.kubernetes.io/instance-type
operator: In
values:
- BM.GPU.H100.8
runner:
name: ome-container
image: docker.io/lmsysorg/sglang:v0.5.8.post1-cu130-amd64
ports:
- containerPort: 8080
name: http1
protocol: TCP
# bash is started as a login shell (-l) running the args script (-c).
command:
- /bin/bash
- '-lc'
- --
# The launch script below is a literal block scalar: keep it free of
# comments and blank lines, since anything inside becomes part of the
# shell command. Notes on its flags:
#   --tp=2 equals the nvidia.com/gpu count of 2 requested under resources
#     (presumably tensor parallelism across both GPUs — confirm).
#   --tool-call-parser / --reasoning-parser are both set to gpt-oss.
#   --speculative-* use EAGLE3 with num-steps=3, eagle-topk=8 and
#     num-draft-tokens=16, with the draft model read from $DRAFT_MODEL_PATH.
#   --log-requests with --log-requests-level=1 enables request logging.
# NOTE(review): $MODEL_PATH and $DRAFT_MODEL_PATH are not set in this
# manifest; presumably injected by the controller — confirm.
args:
- |
python3 -m sglang.launch_server \
--host=0.0.0.0 \
--port=8080 \
--enable-metrics \
--log-requests \
--log-requests-level=1 \
--model-path="$MODEL_PATH" \
--tp=2 \
--tool-call-parser=gpt-oss \
--reasoning-parser=gpt-oss \
--speculative-draft-model-path="$DRAFT_MODEL_PATH" \
--speculative-algorithm=EAGLE3 \
--speculative-num-steps=3 \
--speculative-eagle-topk=8 \
--speculative-num-draft-tokens=16
volumeMounts:
- mountPath: /dev/shm
name: dshm
# Requests and limits are identical: 30 CPU, 256Gi memory, 2 GPUs.
resources:
requests:
cpu: 30
memory: 256Gi
nvidia.com/gpu: 2
limits:
cpu: 30
memory: 256Gi
nvidia.com/gpu: 2
# Readiness is checked via /health_generate every 60 s; 3 consecutive
# failures mark the container not ready.
# NOTE(review): timeoutSeconds (200) is larger than periodSeconds (60) —
# confirm this is intended.
readinessProbe:
httpGet:
path: /health_generate
port: 8080
failureThreshold: 3
successThreshold: 1
periodSeconds: 60
timeoutSeconds: 200
# Liveness uses the /health endpoint: 5 failures at a 60 s period.
livenessProbe:
httpGet:
path: /health
port: 8080
failureThreshold: 5
successThreshold: 1
periodSeconds: 60
timeoutSeconds: 60
# Startup budget: first check after 60 s, then up to 150 failures at a 6 s
# period (150 x 6 s = 900 s) before the startup probe gives up.
startupProbe:
httpGet:
path: /health_generate
port: 8080
failureThreshold: 150
successThreshold: 1
periodSeconds: 6
initialDelaySeconds: 60
timeoutSeconds: 30
17 changes: 17 additions & 0 deletions config/samples/isvc/openai/gpt-oss-120b-eagle3.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
# Sample InferenceService named gpt-oss-120b-eagle3 in the ome namespace.
# It references three other resources by name: a target model, a draft
# model and a serving runtime.
# NOTE(review): leading indentation is not visible in this view, so the
# nesting of the keys below cannot be read off the text — confirm against
# the checked-in file before copying.
apiVersion: ome.io/v1beta1
kind: InferenceService
metadata:
name: gpt-oss-120b-eagle3
namespace: ome
annotations:
# Deployment mode is set to RawDeployment via annotation.
ome.io/deploymentMode: RawDeployment
spec:
# Target model, referenced by name.
# NOTE(review): a model resource named gpt-oss-120b must already exist —
# it is not defined alongside this sample.
model:
name: gpt-oss-120b
# Draft model, referenced by name (gpt-oss-120b-eagle3-long-context).
draftModel:
name: gpt-oss-120b-eagle3-long-context
# Runtime is named explicitly (srt-gpt-oss-120b-eagle3).
runtime:
name: srt-gpt-oss-120b-eagle3
# minReplicas and maxReplicas are both 1, so the replica bounds leave no
# room for scaling.
engine:
minReplicas: 1
maxReplicas: 1
Loading