Patch commentary (free text ahead of the first "diff --git" header).

This patch adds ClusterBaseModel manifests for MiniMax-M2.5, Devstral-2-123B,
Devstral-Small-2-24B, GLM-4.6-FP8 and GLM-4.7-FP8, extends the GLM-4.5V model
metadata, adds one SGLang and four vLLM ClusterServingRuntime manifests,
deletes the SGLang GLM-4.5V runtime, and adds or updates the matching
InferenceService samples.

NOTE(review): the line structure below was rebuilt from a whitespace-collapsed
copy of the patch. YAML indentation (2 spaces, indented sequences; flush
sequences in kustomization.yaml) is assumed; confirm context lines against the
repository before applying.

diff --git a/config/models/MiniMaxAI/MiniMax-M2.5.yaml b/config/models/MiniMaxAI/MiniMax-M2.5.yaml
new file mode 100644
index 00000000..9050e941
--- /dev/null
+++ b/config/models/MiniMaxAI/MiniMax-M2.5.yaml
@@ -0,0 +1,24 @@
+apiVersion: ome.io/v1beta1
+kind: ClusterBaseModel
+metadata:
+  name: minimax-m2-5
+spec:
+  modelCapabilities:
+    - TEXT_TO_TEXT
+  vendor: MiniMaxAI
+  displayName: minimax.minimax-m2-5
+  modelArchitecture: MiniMaxM2ForCausalLM
+  disabled: false
+  version: "1.0.0"
+  modelFormat:
+    name: safetensors
+    version: "1.0.0"
+  modelFramework:
+    name: transformers
+    version: "4.46.1"
+  modelParameterSize: 229B
+  quantization: fp8
+  storage:
+    storageUri: hf://MiniMaxAI/MiniMax-M2.5
+    path: /raid/models/MiniMaxAI/MiniMax-M2.5
+    key: hf-token
diff --git a/config/models/minimax/MiniMax-M2.yaml b/config/models/MiniMaxAI/MiniMax-M2.yaml
similarity index 100%
rename from config/models/minimax/MiniMax-M2.yaml
rename to config/models/MiniMaxAI/MiniMax-M2.yaml
diff --git a/config/models/mistralai/Devstral-2-123B-Instruct-2512.yaml b/config/models/mistralai/Devstral-2-123B-Instruct-2512.yaml
new file mode 100644
index 00000000..6396b636
--- /dev/null
+++ b/config/models/mistralai/Devstral-2-123B-Instruct-2512.yaml
@@ -0,0 +1,24 @@
+apiVersion: ome.io/v1beta1
+kind: ClusterBaseModel
+metadata:
+  name: devstral-2-123b-instruct-2512
+spec:
+  modelCapabilities:
+    - TEXT_TO_TEXT
+  vendor: mistralai
+  displayName: mistralai.devstral-2-123b-instruct-2512
+  modelArchitecture: Ministral3ForCausalLM
+  disabled: false
+  version: "1.0.0"
+  modelFormat:
+    name: safetensors
+    version: "1.0.0"
+  modelFramework:
+    name: transformers
+    version: "5.0.0.dev0"
+  modelParameterSize: 123B
+  quantization: fp8
+  storage:
+    storageUri: hf://mistralai/Devstral-2-123B-Instruct-2512
+    path: /raid/models/mistralai/Devstral-2-123B-Instruct-2512
+    key: hf-token
diff --git a/config/models/mistralai/Devstral-Small-2-24B-Instruct-2512.yaml b/config/models/mistralai/Devstral-Small-2-24B-Instruct-2512.yaml
new file mode 100644
index 00000000..01022a53
--- /dev/null
+++ b/config/models/mistralai/Devstral-Small-2-24B-Instruct-2512.yaml
@@ -0,0 +1,25 @@
+apiVersion: ome.io/v1beta1
+kind: ClusterBaseModel
+metadata:
+  name: devstral-small-2-24b-instruct-2512
+spec:
+  modelCapabilities:
+    - IMAGE_TEXT_TO_TEXT
+    - VIDEO_TEXT_TO_TEXT
+  vendor: mistralai
+  displayName: mistralai.devstral-small-2-24b-instruct-2512
+  modelArchitecture: Mistral3ForConditionalGeneration
+  disabled: false
+  version: "1.0.0"
+  modelFormat:
+    name: safetensors
+    version: "1.0.0"
+  modelFramework:
+    name: transformers
+    version: "5.0.0.dev0"
+  modelParameterSize: 24B
+  quantization: fp8
+  storage:
+    storageUri: hf://mistralai/Devstral-Small-2-24B-Instruct-2512
+    path: /raid/models/mistralai/Devstral-Small-2-24B-Instruct-2512
+    key: hf-token
diff --git a/config/models/zai-org/GLM-4.5V.yaml b/config/models/zai-org/GLM-4.5V.yaml
index 08ef9cc0..ad283c29 100644
--- a/config/models/zai-org/GLM-4.5V.yaml
+++ b/config/models/zai-org/GLM-4.5V.yaml
@@ -8,8 +8,16 @@ spec:
     - VIDEO_TEXT_TO_TEXT
   vendor: zai-org
   displayName: zai-org.glm-4.5v
+  modelArchitecture: Glm4vMoeForConditionalGeneration
   disabled: false
   version: "1.0.0"
+  modelFormat:
+    name: safetensors
+    version: "1.0.0"
+  modelFramework:
+    name: transformers
+    version: "4.57.1"
+  modelParameterSize: 106B
   storage:
     storageUri: hf://zai-org/GLM-4.5V
     path: /raid/models/zai-org/GLM-4.5V
diff --git a/config/models/zai-org/GLM-4.6-FP8.yaml b/config/models/zai-org/GLM-4.6-FP8.yaml
new file mode 100644
index 00000000..a7d2d70a
--- /dev/null
+++ b/config/models/zai-org/GLM-4.6-FP8.yaml
@@ -0,0 +1,24 @@
+apiVersion: ome.io/v1beta1
+kind: ClusterBaseModel
+metadata:
+  name: glm-4-6-fp8
+spec:
+  modelCapabilities:
+    - TEXT_TO_TEXT
+  vendor: zai-org
+  displayName: zai-org.glm-4.6-fp8
+  modelArchitecture: Glm4MoeForCausalLM
+  disabled: false
+  version: "1.0.0"
+  modelFormat:
+    name: safetensors
+    version: "1.0.0"
+  modelFramework:
+    name: transformers
+    version: "4.54.0"
+  modelParameterSize: 355B
+  quantization: fp8
+  storage:
+    storageUri: hf://zai-org/GLM-4.6-FP8
+    path: /raid/models/zai-org/GLM-4.6-FP8
+    key: hf-token
diff --git a/config/models/zai-org/GLM-4.7-FP8.yaml b/config/models/zai-org/GLM-4.7-FP8.yaml
new file mode 100644
index 00000000..72ce2360
--- /dev/null
+++ b/config/models/zai-org/GLM-4.7-FP8.yaml
@@ -0,0 +1,24 @@
+apiVersion: ome.io/v1beta1
+kind: ClusterBaseModel
+metadata:
+  name: glm-4-7-fp8
+spec:
+  modelCapabilities:
+    - TEXT_TO_TEXT
+  vendor: zai-org
+  displayName: zai-org.glm-4.7-fp8
+  modelArchitecture: Glm4MoeForCausalLM
+  disabled: false
+  version: "1.0.0"
+  modelFormat:
+    name: safetensors
+    version: "1.0.0"
+  modelFramework:
+    name: transformers
+    version: "4.52.3"
+  modelParameterSize: 355B
+  quantization: fp8
+  storage:
+    storageUri: hf://zai-org/GLM-4.7-FP8
+    path: /raid/models/zai-org/GLM-4.7-FP8
+    key: hf-token
diff --git a/config/runtimes/kustomization.yaml b/config/runtimes/kustomization.yaml
index 90295bb6..64cabb14 100644
--- a/config/runtimes/kustomization.yaml
+++ b/config/runtimes/kustomization.yaml
@@ -32,6 +32,7 @@ resources:
 - srt/mistral-7b-instruct-rt.yaml
 - srt/mixtral-8x7b-instruct-pd-rt.yaml
 - srt/mixtral-8x7b-instruct-rt.yaml
+- srt/zai-org/glm-text-fp8-tp8-rt.yaml
 # vLLM runtimes
 - vllm/e5-mistral-7b-instruct-rt.yaml
 - vllm/llama-3-1-405b-instruct-fp8-rt.yaml
@@ -50,3 +51,7 @@ resources:
 - vllm/llama-4-scout-17b-16e-instruct-rt.yaml
 - vllm/mistral-7b-instruct-rt.yaml
 - vllm/mixtral-8x7b-instruct-rt.yaml
+- vllm/zai-org/glm-vl-tp4-rt.yaml
+- vllm/minimax/minimax-m2-fp8-tp8-rt.yaml
+- vllm/mistralai/devstral-2-123b-instruct-fp8-tp4-rt.yaml
+- vllm/mistralai/devstral-small-2-24b-instruct-tp2-rt.yaml
diff --git a/config/runtimes/srt/zai-org/glm-4-5v-rt.yaml b/config/runtimes/srt/zai-org/glm-4-5v-rt.yaml
deleted file mode 100644
index 52fc588f..00000000
--- a/config/runtimes/srt/zai-org/glm-4-5v-rt.yaml
+++ /dev/null
@@ -1,160 +0,0 @@
-apiVersion: ome.io/v1beta1
-kind: ClusterServingRuntime
-metadata:
-  name: srt-glm-4-5v
-spec:
-  disabled: false
-  supportedModelFormats:
-    - modelFramework:
-        name: transformers
-        version: "4.57.1"
-      modelFormat:
-        name: safetensors
-        version: "1.0.0"
-      modelArchitecture: Glm4vMoeForConditionalGeneration
-      autoSelect: false
-      priority: 1
-  protocolVersions:
-    - openAI
-  modelSizeRange:
-    min: 8B
-    max: 10B
-  engineConfig:
-    annotations:
-      prometheus.io/scrape: "true"
-      prometheus.io/port: "8080"
-      prometheus.io/path: "/metrics"
-    labels:
-      logging-forward: enabled
-    tolerations:
-      - key: "nvidia.com/gpu"
-        operator: "Exists"
-        effect: "NoSchedule"
-    volumes:
-      - name: dshm
-        emptyDir:
-          medium: Memory
-    runner:
-      name: ome-container
-      image: docker.io/lmsysorg/sglang:v0.5.5.post3-cu129-amd64
-      ports:
-        - containerPort: 8080
-          name: http1
-          protocol: TCP
-      command:
-        - python3
-        - -m
-        - sglang.launch_server
-        - --host
-        - "0.0.0.0"
-        - --port
-        - "8080"
-        - --enable-metrics
-        - --log-requests
-        - --model-path
-        - $(MODEL_PATH)
-        - --tp-size
-        - "1"
-        - --mem-frac
-        - "0.9"
-        - --served-model-name
-        - zai-org/GLM-4.5V
-      volumeMounts:
-        - mountPath: /dev/shm
-          name: dshm
-      resources:
-        requests:
-          cpu: 10
-          memory: 30Gi
-          nvidia.com/gpu: 1
-        limits:
-          cpu: 10
-          memory: 30Gi
-          nvidia.com/gpu: 1
-
-      readinessProbe:
-        httpGet:
-          path: /health_generate
-          port: 8080
-        failureThreshold: 3
-        successThreshold: 1
-        periodSeconds: 60
-        timeoutSeconds: 200
-
-      livenessProbe:
-        httpGet:
-          path: /health
-          port: 8080
-        failureThreshold: 5
-        successThreshold: 1
-        periodSeconds: 60
-        timeoutSeconds: 60
-
-      startupProbe:
-        httpGet:
-          path: /health_generate
-          port: 8080
-        failureThreshold: 150
-        successThreshold: 1
-        periodSeconds: 6
-        initialDelaySeconds: 60
-        timeoutSeconds: 30
-  routerConfig:
-    runner:
-      name: router
-      image: fra.ocir.io/idqj093njucb/smg:v0.2.4.post1-dev
-      resources:
-        limits:
-          cpu: "1"
-          memory: "2Gi"
-      ports:
-        - containerPort: 8080
-          name: http
-      command:
-        - python3
-        - -m
-        - sglang_router.launch_router
-        - --host
-        - "0.0.0.0"
-        - --port
-        - "8080"
-        - --service-discovery
-        - --service-discovery-namespace
-        - $(NAMESPACE)
-        - --service-discovery-port
-        - "8080"
-        - --selector
-        - component=engine ome.io/inferenceservice=$(INFERENCESERVICE_NAME)
-      env:
-        - name: NAMESPACE
-          valueFrom:
-            fieldRef:
-              fieldPath: metadata.namespace
-        - name: INFERENCESERVICE_NAME
-          valueFrom:
-            fieldRef:
-              fieldPath: metadata.labels['ome.io/inferenceservice']
-      readinessProbe:
-        httpGet:
-          path: /readiness
-          port: 8080
-        failureThreshold: 5
-        successThreshold: 1
-        periodSeconds: 30
-        timeoutSeconds: 10
-      livenessProbe:
-        httpGet:
-          path: /liveness
-          port: 8080
-        failureThreshold: 5
-        successThreshold: 1
-        periodSeconds: 30
-        timeoutSeconds: 10
-      startupProbe:
-        httpGet:
-          path: /readiness
-          port: 8080
-        failureThreshold: 10
-        successThreshold: 1
-        periodSeconds: 20
-        timeoutSeconds: 10
diff --git a/config/runtimes/srt/zai-org/glm-text-fp8-tp8-rt.yaml b/config/runtimes/srt/zai-org/glm-text-fp8-tp8-rt.yaml
new file mode 100644
index 00000000..91cc9156
--- /dev/null
+++ b/config/runtimes/srt/zai-org/glm-text-fp8-tp8-rt.yaml
@@ -0,0 +1,123 @@
+apiVersion: ome.io/v1beta1
+kind: ClusterServingRuntime
+metadata:
+  name: srt-glm-text-fp8-tp8
+spec:
+  disabled: false
+  supportedModelFormats:
+    - modelFramework:
+        name: transformers
+        version: "4.54.0"
+      modelFormat:
+        name: safetensors
+        version: "1.0.0"
+      modelArchitecture: Glm4MoeForCausalLM
+      quantization: fp8
+      autoSelect: true
+      priority: 1
+    - modelFramework:
+        name: transformers
+        version: "4.52.3"
+      modelFormat:
+        name: safetensors
+        version: "1.0.0"
+      modelArchitecture: Glm4MoeForCausalLM
+      quantization: fp8
+      autoSelect: true
+      priority: 1
+  protocolVersions:
+    - openAI
+  modelSizeRange:
+    min: 350B
+    max: 360B
+  engineConfig:
+    annotations:
+      prometheus.io/scrape: "true"
+      prometheus.io/port: "8080"
+      prometheus.io/path: "/metrics"
+    labels:
+      logging-forward: enabled
+    tolerations:
+      - key: "nvidia.com/gpu"
+        operator: "Exists"
+        effect: "NoSchedule"
+    volumes:
+      - name: dshm
+        emptyDir:
+          medium: Memory
+    runner:
+      name: ome-container
+      image: ghcr.io/lightseekorg/smg:1.3.2-sglang-v0.5.9
+      ports:
+        - containerPort: 8080
+          name: http1
+          protocol: TCP
+      command:
+        - python3
+        - -m
+        - sglang.launch_server
+        - --host
+        - "0.0.0.0"
+        - --port
+        - "8080"
+        - --enable-metrics
+        - --log-requests
+        - --model-path
+        - $(MODEL_PATH)
+        - --tp-size
+        - "8"
+        - --mem-fraction-static
+        - "0.8"
+        - --trust-remote-code
+        - --tool-call-parser
+        - glm47
+        - --reasoning-parser
+        - glm45
+        - --speculative-algorithm
+        - EAGLE
+        - --speculative-num-steps
+        - "3"
+        - --speculative-eagle-topk
+        - "1"
+        - --speculative-num-draft-tokens
+        - "4"
+      volumeMounts:
+        - mountPath: /dev/shm
+          name: dshm
+      resources:
+        requests:
+          cpu: 40
+          memory: 640Gi
+          nvidia.com/gpu: 8
+        limits:
+          cpu: 40
+          memory: 640Gi
+          nvidia.com/gpu: 8
+
+      readinessProbe:
+        httpGet:
+          path: /health_generate
+          port: 8080
+        failureThreshold: 3
+        successThreshold: 1
+        periodSeconds: 90
+        timeoutSeconds: 20
+
+      livenessProbe:
+        httpGet:
+          path: /health
+          port: 8080
+        failureThreshold: 5
+        successThreshold: 1
+        periodSeconds: 60
+        timeoutSeconds: 60
+
+      startupProbe:
+        httpGet:
+          path: /health_generate
+          port: 8080
+        failureThreshold: 150
+        successThreshold: 1
+        periodSeconds: 6
+        initialDelaySeconds: 60
+        timeoutSeconds: 30
diff --git a/config/runtimes/vllm/minimax/minimax-m2-fp8-tp8-rt.yaml b/config/runtimes/vllm/minimax/minimax-m2-fp8-tp8-rt.yaml
new file mode 100644
index 00000000..2bc02952
--- /dev/null
+++ b/config/runtimes/vllm/minimax/minimax-m2-fp8-tp8-rt.yaml
@@ -0,0 +1,108 @@
+apiVersion: ome.io/v1beta1
+kind: ClusterServingRuntime
+metadata:
+  name: vllm-minimax-m2-5-fp8-tp8
+spec:
+  disabled: false
+  supportedModelFormats:
+    - modelFramework:
+        name: transformers
+        version: "4.46.1"
+      modelFormat:
+        name: safetensors
+        version: "1.0.0"
+      modelArchitecture: MiniMaxM2ForCausalLM
+      quantization: fp8
+      autoSelect: true
+      priority: 1
+  protocolVersions:
+    - openAI
+  modelSizeRange:
+    min: 220B
+    max: 235B
+  engineConfig:
+    annotations:
+      prometheus.io/scrape: "true"
+      prometheus.io/port: "8080"
+      prometheus.io/path: "/metrics"
+    labels:
+      logging-forward: enabled
+    tolerations:
+      - key: "nvidia.com/gpu"
+        operator: "Exists"
+        effect: "NoSchedule"
+    volumes:
+      - name: dshm
+        emptyDir:
+          medium: Memory
+    runner:
+      name: ome-container
+      image: ghcr.io/lightseekorg/smg:1.3.2-vllm-v0.17.0
+      env:
+        - name: SAFETENSORS_FAST_GPU
+          value: "1"
+      ports:
+        - containerPort: 8080
+          name: http1
+          protocol: TCP
+      command:
+        - python3
+        - -m
+        - vllm.entrypoints.openai.api_server
+        - --host
+        - "0.0.0.0"
+        - --port
+        - "8080"
+        - --model
+        - $(MODEL_PATH)
+        - --tensor-parallel-size
+        - "8"
+        - --gpu-memory-utilization
+        - "0.9"
+        - --trust-remote-code
+        - --enable-expert-parallel
+        - --enable-auto-tool-choice
+        - --tool-call-parser
+        - minimax_m2
+        - --reasoning-parser
+        - minimax_m2_append_think
+      volumeMounts:
+        - mountPath: /dev/shm
+          name: dshm
+      resources:
+        requests:
+          cpu: 40
+          memory: 640Gi
+          nvidia.com/gpu: 8
+        limits:
+          cpu: 40
+          memory: 640Gi
+          nvidia.com/gpu: 8
+
+      readinessProbe:
+        httpGet:
+          path: /health
+          port: 8080
+        failureThreshold: 3
+        successThreshold: 1
+        periodSeconds: 90
+        timeoutSeconds: 20
+
+      livenessProbe:
+        httpGet:
+          path: /health
+          port: 8080
+        failureThreshold: 5
+        successThreshold: 1
+        periodSeconds: 60
+        timeoutSeconds: 60
+
+      startupProbe:
+        httpGet:
+          path: /health
+          port: 8080
+        failureThreshold: 150
+        successThreshold: 1
+        periodSeconds: 6
+        initialDelaySeconds: 60
+        timeoutSeconds: 30
diff --git a/config/runtimes/srt/minimax/minimax-m2-rt.yaml b/config/runtimes/vllm/mistralai/devstral-2-123b-instruct-fp8-tp4-rt.yaml
similarity index 83%
rename from config/runtimes/srt/minimax/minimax-m2-rt.yaml
rename to config/runtimes/vllm/mistralai/devstral-2-123b-instruct-fp8-tp4-rt.yaml
index 8d1a38dd..60532675 100644
--- a/config/runtimes/srt/minimax/minimax-m2-rt.yaml
+++ b/config/runtimes/vllm/mistralai/devstral-2-123b-instruct-fp8-tp4-rt.yaml
@@ -1,24 +1,25 @@
 apiVersion: ome.io/v1beta1
 kind: ClusterServingRuntime
 metadata:
-  name: srt-minimax-m2
+  name: vllm-devstral-2-123b-fp8-tp4
 spec:
   disabled: false
   supportedModelFormats:
     - modelFramework:
         name: transformers
-        version: "4.57.1"
+        version: "5.0.0.dev0"
       modelFormat:
         name: safetensors
         version: "1.0.0"
-      modelArchitecture: MiniMaxM2ForCausalLM
-      autoSelect: false
+      modelArchitecture: Ministral3ForCausalLM
+      quantization: fp8
+      autoSelect: true
       priority: 1
   protocolVersions:
     - openAI
   modelSizeRange:
-    min: 40B
-    max: 50B
+    min: 115B
+    max: 130B
   engineConfig:
     annotations:
       prometheus.io/scrape: "true"
@@ -36,7 +37,7 @@ spec:
           medium: Memory
     runner:
       name: ome-container
-      image: docker.io/lmsysorg/sglang:v0.5.5.post3-cu129-amd64
+      image: ghcr.io/lightseekorg/smg:1.3.2-vllm-v0.17.0
       ports:
         - containerPort: 8080
           name: http1
@@ -44,42 +45,41 @@ spec:
       command:
         - python3
         - -m
-        - sglang.launch_server
+        - vllm.entrypoints.openai.api_server
         - --host
         - "0.0.0.0"
         - --port
         - "8080"
-        - --enable-metrics
-        - --log-requests
-        - --model-path
+        - --model
         - $(MODEL_PATH)
-        - --tp-size
+        - --tensor-parallel-size
         - "4"
-        - --mem-frac
+        - --gpu-memory-utilization
         - "0.9"
-        - --served-model-name
-        - minimax/MiniMax-M2
+        - --enable-auto-tool-choice
+        - --tool-call-parser
+        - mistral
       volumeMounts:
         - mountPath: /dev/shm
           name: dshm
       resources:
         requests:
           cpu: 20
-          memory: 160Gi
+          memory: 320Gi
           nvidia.com/gpu: 4
         limits:
           cpu: 20
-          memory: 160Gi
+          memory: 320Gi
           nvidia.com/gpu: 4
 
       readinessProbe:
         httpGet:
-          path: /health_generate
+          path: /health
           port: 8080
         failureThreshold: 3
         successThreshold: 1
-        periodSeconds: 60
-        timeoutSeconds: 200
+        periodSeconds: 90
+        timeoutSeconds: 20
 
       livenessProbe:
         httpGet:
@@ -92,7 +92,7 @@ spec:
 
       startupProbe:
         httpGet:
-          path: /health_generate
+          path: /health
           port: 8080
         failureThreshold: 150
         successThreshold: 1
diff --git a/config/runtimes/srt/mistralai/mistral-small-3-1-24b-instruct-2503-rt.yaml b/config/runtimes/vllm/mistralai/devstral-small-2-24b-instruct-tp2-rt.yaml
similarity index 82%
rename from config/runtimes/srt/mistralai/mistral-small-3-1-24b-instruct-2503-rt.yaml
rename to config/runtimes/vllm/mistralai/devstral-small-2-24b-instruct-tp2-rt.yaml
index 801e60c2..69559978 100644
--- a/config/runtimes/srt/mistralai/mistral-small-3-1-24b-instruct-2503-rt.yaml
+++ b/config/runtimes/vllm/mistralai/devstral-small-2-24b-instruct-tp2-rt.yaml
@@ -1,7 +1,7 @@
 apiVersion: ome.io/v1beta1
 kind: ClusterServingRuntime
 metadata:
-  name: srt-mistral-small-3-1-24b-instruct-2503
+  name: vllm-devstral-small-2-24b-instruct-tp2
 spec:
   disabled: false
   supportedModelFormats:
@@ -14,6 +14,16 @@ spec:
       modelArchitecture: Mistral3ForConditionalGeneration
       autoSelect: false
       priority: 1
+    - modelFramework:
+        name: transformers
+        version: "5.0.0.dev0"
+      modelFormat:
+        name: safetensors
+        version: "1.0.0"
+      modelArchitecture: Mistral3ForConditionalGeneration
+      quantization: fp8
+      autoSelect: true
+      priority: 1
   protocolVersions:
     - openAI
   modelSizeRange:
@@ -36,7 +46,7 @@ spec:
           medium: Memory
     runner:
       name: ome-container
-      image: docker.io/lmsysorg/sglang:v0.5.5.post3-cu129-amd64
+      image: ghcr.io/lightseekorg/smg:1.3.2-vllm-v0.17.0
       ports:
         - containerPort: 8080
           name: http1
@@ -44,21 +54,20 @@ spec:
       command:
         - python3
         - -m
-        - sglang.launch_server
+        - vllm.entrypoints.openai.api_server
         - --host
         - "0.0.0.0"
         - --port
         - "8080"
-        - --enable-metrics
-        - --log-requests
-        - --model-path
+        - --model
         - $(MODEL_PATH)
-        - --tp-size
+        - --tensor-parallel-size
         - "2"
-        - --mem-frac
+        - --gpu-memory-utilization
         - "0.9"
-        - --served-model-name
-        - mistralai/Mistral-Small-3.1-24B-Instruct-2503
+        - --enable-auto-tool-choice
+        - --tool-call-parser
+        - mistral
       volumeMounts:
         - mountPath: /dev/shm
           name: dshm
@@ -74,12 +83,12 @@ spec:
 
       readinessProbe:
         httpGet:
-          path: /health_generate
+          path: /health
           port: 8080
         failureThreshold: 3
         successThreshold: 1
-        periodSeconds: 60
-        timeoutSeconds: 200
+        periodSeconds: 90
+        timeoutSeconds: 20
 
       livenessProbe:
         httpGet:
@@ -92,7 +101,7 @@ spec:
 
       startupProbe:
         httpGet:
-          path: /health_generate
+          path: /health
           port: 8080
         failureThreshold: 150
         successThreshold: 1
diff --git a/config/runtimes/vllm/zai-org/glm-vl-tp4-rt.yaml b/config/runtimes/vllm/zai-org/glm-vl-tp4-rt.yaml
new file mode 100644
index 00000000..d59148f8
--- /dev/null
+++ b/config/runtimes/vllm/zai-org/glm-vl-tp4-rt.yaml
@@ -0,0 +1,109 @@
+apiVersion: ome.io/v1beta1
+kind: ClusterServingRuntime
+metadata:
+  name: vllm-glm-vl-tp4
+spec:
+  disabled: false
+  supportedModelFormats:
+    - modelFramework:
+        name: transformers
+        version: "4.57.1"
+      modelFormat:
+        name: safetensors
+        version: "1.0.0"
+      modelArchitecture: Glm4vMoeForConditionalGeneration
+      autoSelect: true
+      priority: 1
+    - modelFramework:
+        name: transformers
+        version: "5.0.0-rc0"
+      modelFormat:
+        name: safetensors
+        version: "1.0.0"
+      modelArchitecture: Glm4vMoeForConditionalGeneration
+      autoSelect: true
+      priority: 1
+  protocolVersions:
+    - openAI
+  modelSizeRange:
+    min: 100B
+    max: 115B
+  engineConfig:
+    annotations:
+      prometheus.io/scrape: "true"
+      prometheus.io/port: "8080"
+      prometheus.io/path: "/metrics"
+    labels:
+      logging-forward: enabled
+    tolerations:
+      - key: "nvidia.com/gpu"
+        operator: "Exists"
+        effect: "NoSchedule"
+    volumes:
+      - name: dshm
+        emptyDir:
+          medium: Memory
+    runner:
+      name: ome-container
+      image: ghcr.io/lightseekorg/smg:1.3.2-vllm-v0.17.0
+      ports:
+        - containerPort: 8080
+          name: http1
+          protocol: TCP
+      command:
+        - python3
+        - -m
+        - vllm.entrypoints.openai.api_server
+        - --host
+        - "0.0.0.0"
+        - --port
+        - "8080"
+        - --model
+        - $(MODEL_PATH)
+        - --tensor-parallel-size
+        - "4"
+        - --gpu-memory-utilization
+        - "0.9"
+        - --enable-auto-tool-choice
+        - --tool-call-parser
+        - glm45
+      volumeMounts:
+        - mountPath: /dev/shm
+          name: dshm
+      resources:
+        requests:
+          cpu: 20
+          memory: 200Gi
+          nvidia.com/gpu: 4
+        limits:
+          cpu: 20
+          memory: 200Gi
+          nvidia.com/gpu: 4
+
+      readinessProbe:
+        httpGet:
+          path: /health
+          port: 8080
+        failureThreshold: 3
+        successThreshold: 1
+        periodSeconds: 90
+        timeoutSeconds: 20
+
+      livenessProbe:
+        httpGet:
+          path: /health
+          port: 8080
+        failureThreshold: 5
+        successThreshold: 1
+        periodSeconds: 60
+        timeoutSeconds: 60
+
+      startupProbe:
+        httpGet:
+          path: /health
+          port: 8080
+        failureThreshold: 150
+        successThreshold: 1
+        periodSeconds: 6
+        initialDelaySeconds: 60
+        timeoutSeconds: 30
diff --git a/config/samples/isvc/MiniMaxAI/minimax-m2-5.yaml b/config/samples/isvc/MiniMaxAI/minimax-m2-5.yaml
new file mode 100644
index 00000000..e458bd27
--- /dev/null
+++ b/config/samples/isvc/MiniMaxAI/minimax-m2-5.yaml
@@ -0,0 +1,17 @@
+---
+apiVersion: v1
+kind: Namespace
+metadata:
+  name: minimax
+---
+apiVersion: ome.io/v1beta1
+kind: InferenceService
+metadata:
+  name: minimax-m2-5
+  namespace: minimax
+spec:
+  model:
+    name: minimax-m2-5
+  engine:
+    minReplicas: 1
+    maxReplicas: 1
diff --git a/config/samples/isvc/mistralai/devstral-2-123b-instruct-2512.yaml b/config/samples/isvc/mistralai/devstral-2-123b-instruct-2512.yaml
new file mode 100644
index 00000000..b03e357d
--- /dev/null
+++ b/config/samples/isvc/mistralai/devstral-2-123b-instruct-2512.yaml
@@ -0,0 +1,17 @@
+---
+apiVersion: v1
+kind: Namespace
+metadata:
+  name: mistralai
+---
+apiVersion: ome.io/v1beta1
+kind: InferenceService
+metadata:
+  name: devstral-2-123b-instruct-2512
+  namespace: mistralai
+spec:
+  model:
+    name: devstral-2-123b-instruct-2512
+  engine:
+    minReplicas: 1
+    maxReplicas: 1
diff --git a/config/samples/isvc/mistralai/devstral-small-2-24b-instruct-2512.yaml b/config/samples/isvc/mistralai/devstral-small-2-24b-instruct-2512.yaml
new file mode 100644
index 00000000..f81c89da
--- /dev/null
+++ b/config/samples/isvc/mistralai/devstral-small-2-24b-instruct-2512.yaml
@@ -0,0 +1,17 @@
+---
+apiVersion: v1
+kind: Namespace
+metadata:
+  name: mistralai
+---
+apiVersion: ome.io/v1beta1
+kind: InferenceService
+metadata:
+  name: devstral-small-2-24b-instruct-2512
+  namespace: mistralai
+spec:
+  model:
+    name: devstral-small-2-24b-instruct-2512
+  engine:
+    minReplicas: 1
+    maxReplicas: 1
diff --git a/config/samples/isvc/zai-org/glm-4-5v.yaml b/config/samples/isvc/zai-org/glm-4-5v.yaml
index a097215c..780a4e31 100644
--- a/config/samples/isvc/zai-org/glm-4-5v.yaml
+++ b/config/samples/isvc/zai-org/glm-4-5v.yaml
@@ -1,11 +1,17 @@
+---
+apiVersion: v1
+kind: Namespace
+metadata:
+  name: glm
+---
 apiVersion: ome.io/v1beta1
 kind: InferenceService
 metadata:
-  name: glm-4-5v-test
-  namespace: zai-org-test
+  name: glm-4-5v
+  namespace: glm
 spec:
-  modelRef:
+  model:
     name: glm-4-5v
-  minReplicas: 1
-  maxReplicas: 1
-  scaleTarget: 1
+  engine:
+    minReplicas: 1
+    maxReplicas: 1
diff --git a/config/samples/isvc/zai-org/glm-4-6-fp8.yaml b/config/samples/isvc/zai-org/glm-4-6-fp8.yaml
new file mode 100644
index 00000000..93a6fbb5
--- /dev/null
+++ b/config/samples/isvc/zai-org/glm-4-6-fp8.yaml
@@ -0,0 +1,17 @@
+---
+apiVersion: v1
+kind: Namespace
+metadata:
+  name: glm
+---
+apiVersion: ome.io/v1beta1
+kind: InferenceService
+metadata:
+  name: glm-4-6-fp8
+  namespace: glm
+spec:
+  model:
+    name: glm-4-6-fp8
+  engine:
+    minReplicas: 1
+    maxReplicas: 1
diff --git a/config/samples/isvc/zai-org/glm-4-7-fp8.yaml b/config/samples/isvc/zai-org/glm-4-7-fp8.yaml
new file mode 100644
index 00000000..6e7a2d69
--- /dev/null
+++ b/config/samples/isvc/zai-org/glm-4-7-fp8.yaml
@@ -0,0 +1,17 @@
+---
+apiVersion: v1
+kind: Namespace
+metadata:
+  name: glm
+---
+apiVersion: ome.io/v1beta1
+kind: InferenceService
+metadata:
+  name: glm-4-7-fp8
+  namespace: glm
+spec:
+  model:
+    name: glm-4-7-fp8
+  engine:
+    minReplicas: 1
+    maxReplicas: 1