ome-projects · TJ5 · Mar 4, 2026 · Mar 5, 2026 · Mar 5, 2026 · Mar 5, 2026
@@ -139,6 +139,7 @@ This document provides a comprehensive reference of all models supported by SGLa
 |-------------------------|----------------------------------------|-------------------------|---------------------------------|--------|---------|----------------|------------|
 | GPT-OSS 20B             | `openai/gpt-oss-20b`                   | 20B                     | GPTOSSForCausalLM               | 40 GB  | -       | No             | Configured |
 | GPT-OSS 120B            | `openai/gpt-oss-120b`                  | 120B                    | GPTOSSForCausalLM               | 240 GB | -       | No             | Configured |
+| GPT-OSS 120B Eagle3 (draft) | `nvidia/gpt-oss-120b-eagle3-long-context` | Draft for speculative decoding | GptOssForCausalLM               | -      | -       | No             | Configured |
 | ChatGLM2 6B             | `THUDM/chatglm2-6b`                    | 6B                      | ChatGLMForConditionalGeneration | 12 GB  | 32K     | No             | Configured |
 | GLM-4 9B Chat           | `ZhipuAI/glm-4-9b-chat`                | 9B                      | ChatGLMForConditionalGeneration | 18 GB  | 1M      | No             | Configured |
 | InternLM2 7B            | `internlm/internlm2-7b`                | 7B                      | InternLM2ForCausalLM            | 14 GB  | 32K     | No             | Configured |

@@ -124,6 +124,7 @@ resources:
   - nvidia/Llama-3.1-Nemotron-Nano-8B-v1.yaml
   - nvidia/NVIDIA-Nemotron-Nano-12B-v2-VL-BF16.yaml
   - nvidia/NVIDIA-Nemotron-Nano-9B-v2.yaml
+  - nvidia/gpt-oss-120b-eagle3-long-context.yaml
 
   # openai
   - openai/clip-vit-large-patch14-336.yaml

@@ -0,0 +1,22 @@
+# Draft model entry for GPT-OSS 120B EAGLE3, pulled from hf://nvidia.
+apiVersion: ome.io/v1beta1
+kind: ClusterBaseModel
+metadata:
+  name: gpt-oss-120b-eagle3-long-context
+spec:
+  displayName: nvidia.gpt-oss-120b-eagle3-long-context
+  vendor: nvidia
+  version: '1.0.0'
+  disabled: false
+  modelCapabilities: [TEXT_TO_TEXT]
+  modelArchitecture: GptOssForCausalLM
+  modelFormat:
+    version: '1.0.0'
+    name: safetensors
+  modelFramework:
+    version: '4.55.0'
+    name: transformers
+  storage:
+    key: 'hf-token'
+    path: /raid/models/nvidia/gpt-oss-120b-eagle3-long-context
+    storageUri: hf://nvidia/gpt-oss-120b-eagle3-long-context
@@ -8,6 +8,7 @@ resources:
 - srt/e5-mistral-7b-instruct-rt.yaml
 - srt/gpt-oss-120b-rt.yaml
 - srt/gpt-oss-20b-rt.yaml
+- srt/openai/gpt-oss-120b-eagle3-rt.yaml
 - srt/kimi-k2-pd-rt.yaml
 - srt/meta/llama-3-1-70b-instruct-pd-rt.yaml
 - srt/meta/llama-3-1-70b-instruct-rt.yaml

@@ -0,0 +1,117 @@
+apiVersion: ome.io/v1beta1
+kind: ClusterServingRuntime  # SGLang runtime for GPT-OSS 120B with EAGLE3 speculative decoding
+metadata:
+  name: srt-gpt-oss-120b-eagle3
+spec:
+  disabled: false
+  supportedModelFormats:  # match criteria for the target (non-draft) model
+    - modelFramework:
+        name: transformers
+        version: "4.55.0"
+      modelFormat:
+        name: safetensors
+        version: "1.0.0"
+      modelArchitecture: GptOssForCausalLM
+      autoSelect: false  # NOTE(review): presumably an InferenceService must name this runtime explicitly — confirm
+      priority: 1
+  modelSizeRange:  # size window for the target model
+    min: 115B
+    max: 125B
+  draftModelSizeRange:  # size window for the draft model
+    min: 0.1B
+    max: 1B
+  supportedDraftModelFormats:  # draft entry lists a format only; no framework or architecture
+    - modelFormat:
+        name: safetensors
+        version: "1.0.0"
+  protocolVersions:
+    - openAI
+  engineConfig:
+    annotations:  # scrape port/path line up with --port=8080 and --enable-metrics below
+      prometheus.io/scrape: "true"
+      prometheus.io/port: "8080"
+      prometheus.io/path: "/metrics"
+    labels:
+      logging-forward: enabled
+    tolerations:
+      - key: "nvidia.com/gpu"
+        operator: "Exists"
+        effect: "NoSchedule"
+    volumes:
+      - name: dshm  # memory-backed emptyDir, mounted at /dev/shm by the runner
+        emptyDir:
+          medium: Memory
+    affinity:
+      nodeAffinity:
+        requiredDuringSchedulingIgnoredDuringExecution:
+          nodeSelectorTerms:
+            - matchExpressions:
+                - key: node.kubernetes.io/instance-type
+                  operator: In
+                  values:
+                    - BM.GPU.H100.8  # the only instance type this required affinity admits
+    runner:
+      name: ome-container
+      image: docker.io/lmsysorg/sglang:v0.5.8.post1-cu130-amd64
+      ports:
+        - containerPort: 8080
+          name: http1
+          protocol: TCP
+      command:  # the args entry is appended after these, so bash runs it as the -c script
+        - /bin/bash
+        - '-lc'
+        - --
+      args:  # launch script; keep --tp in step with the nvidia.com/gpu count under resources
+        - |
+          python3 -m sglang.launch_server \
+          --host=0.0.0.0 \
+          --port=8080 \
+          --enable-metrics \
+          --log-requests \
+          --log-requests-level=1 \
+          --model-path="$MODEL_PATH" \
+          --tp=2 \
+          --tool-call-parser=gpt-oss \
+          --reasoning-parser=gpt-oss \
+          --speculative-draft-model-path="$DRAFT_MODEL_PATH" \
+          --speculative-algorithm=EAGLE3 \
+          --speculative-num-steps=3 \
+          --speculative-eagle-topk=8 \
+          --speculative-num-draft-tokens=16
+      volumeMounts:
+        - mountPath: /dev/shm
+          name: dshm
+      resources:  # requests equal limits; 2 GPUs to match --tp=2
+        requests:
+          cpu: 30
+          memory: 256Gi
+          nvidia.com/gpu: 2
+        limits:
+          cpu: 30
+          memory: 256Gi
+          nvidia.com/gpu: 2
+      readinessProbe:  # NOTE(review): timeoutSeconds (200) exceeds periodSeconds (60) — confirm this is intended
+        httpGet:
+          path: /health_generate
+          port: 8080
+        failureThreshold: 3
+        successThreshold: 1
+        periodSeconds: 60
+        timeoutSeconds: 200
+      livenessProbe:  # probes /health, unlike readiness/startup which probe /health_generate
+        httpGet:
+          path: /health
+          port: 8080
+        failureThreshold: 5
+        successThreshold: 1
+        periodSeconds: 60
+        timeoutSeconds: 60
+      startupProbe:  # 60 s initial delay, then up to 150 failed checks at 6 s intervals
+        httpGet:
+          path: /health_generate
+          port: 8080
+        failureThreshold: 150
+        successThreshold: 1
+        periodSeconds: 6
+        initialDelaySeconds: 60
+        timeoutSeconds: 30
@@ -0,0 +1,17 @@
+# InferenceService pairing GPT-OSS 120B with its EAGLE3 draft model.
+apiVersion: ome.io/v1beta1
+kind: InferenceService
+metadata:
+  namespace: ome
+  name: gpt-oss-120b-eagle3
+  annotations:
+    ome.io/deploymentMode: RawDeployment
+spec:
+  # Pinned to a single replica (min == max).
+  engine:
+    maxReplicas: 1
+    minReplicas: 1
+  # Runtime, target model and draft model are all referenced by name.
+  runtime: {name: srt-gpt-oss-120b-eagle3}
+  model: {name: gpt-oss-120b}
+  draftModel: {name: gpt-oss-120b-eagle3-long-context}