vllm-project · max-wittig · Apr 16, 2026 · Apr 23, 2026
diff --git a/src/vllm_router/README.md b/src/vllm_router/README.md
@@ -29,6 +29,7 @@ The router can be configured using command-line arguments. Below are the availab
 - `--static-models`: The models running in the static serving engines, separated by commas (e.g., `model1,model2`).
 - `--static-aliases`: The aliases of the models running in the static serving engines, separated by commas and associated using colons (e.g., `model_alias1:model,mode_alias2:model`).
 - `--static-backend-health-checks`: Enable this flag to make vllm-router check periodically if the models work by sending dummy requests to their endpoints.
+- `--static-healthcheck-disabled`: Comma-separated `true`/`false` values, one per entry in `--static-backends` and in the same order (e.g., `true,false,true`). Backends marked `true` are excluded from periodic health checks but are still routed to.
 - `--k8s-port`: The port of vLLM processes when using K8s service discovery. Default is `8000`.
 - `--k8s-namespace`: The namespace of vLLM pods when using K8s service discovery. Default is `default`.
 - `--k8s-label-selector`: The label selector to filter vLLM pods when using K8s service discovery.
@@ -108,6 +109,45 @@ different endpoints for each model type.
 > Enabling this flag will put some load on your backend every minute as real requests are send to the nodes
 > to test their functionality.
 
+### Disabling health checks for specific models
+
+You can disable health checks for individual models while keeping them enabled globally.
+This is useful when certain models do not support the test payloads used by the health
+check, or when you want to avoid the overhead on specific backends.
+
+**In a YAML config file**, add `healthcheck_disabled: true` to any model entry:
+
+```yaml
+static_backend_health_checks: true
+static_models:
+    meta-llama/Llama-3.1-8B-Instruct:
+        static_backends:
+            - http://localhost:9001
+        static_model_type: chat
+    my-custom-model:
+        static_backends:
+            - http://localhost:9002
+        static_model_type: completion
+        healthcheck_disabled: true   # this model will not be health-checked
+```
+
+**Via CLI**, use the `--static-healthcheck-disabled` flag with a comma-separated list of
+`true`/`false` values that correspond positionally to each backend:
+
+```bash
+vllm-router --port 8000 \
+    --service-discovery static \
+    --static-backends "http://localhost:9001,http://localhost:9002" \
+    --static-models "meta-llama/Llama-3.1-8B-Instruct,my-custom-model" \
+    --static-model-types "chat,completion" \
+    --static-backend-health-checks \
+    --static-healthcheck-disabled "false,true" \
+    --routing-logic roundrobin
+```
+
+Models with health checks disabled will still be routed to normally, but they are
+skipped during the periodic health check loop and will never be marked as unhealthy by it.
+
 ## Dynamic Router Config
 
 The router can be configured dynamically using a config file when passing the `--dynamic-config-yaml` or
@@ -128,6 +168,7 @@ Currently, the dynamic config supports the following fields:
 - (When using `static` service discovery) `static_models`: The models running in the static serving engines, separated by commas (e.g., `model1,model2`).
 - (When using `static` service discovery) `static_aliases`: The aliases of the models running in the static serving engines, separated by commas and associated using colons (e.g., `model_alias1:model,mode_alias2:model`).
 - (When using `static` service discovery and if you enable the `--static-backend-health-checks` flag) `static_model_types`: The model types running in the static serving engines, separated by commas (e.g., `chat,chat`).
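+- (When using `static` service discovery) `static_healthcheck_disabled`: A comma-separated list of `true`/`false` values, one per static backend and in the same order (e.g., `false,true,false`), that excludes the matching backends from periodic health checks. In a YAML config, set the per-model boolean `healthcheck_disabled` under each model entry instead; it defaults to `false`.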
 - (When using `k8s` service discovery) `k8s_port`: The port of vLLM processes when using K8s service discovery. Default is `8000`.
 - (When using `k8s` service discovery) `k8s_namespace`: The namespace of vLLM pods when using K8s service discovery. Default is `default`.
 - (When using `k8s` service discovery) `k8s_label_selector`: The label selector to filter vLLM pods when using K8s service discovery.
@@ -139,6 +180,7 @@ Here is an example of a dynamic YAML config file:
 service_discovery: static
 routing_logic: roundrobin
 callbacks: module.custom.callback_handler
+static_backend_health_checks: true
 static_models:
     facebook/opt-125m:
         static_backends:
@@ -149,6 +191,7 @@ static_models:
         static_backends:
             - http://localhost:9002
         static_model_type: chat
+        healthcheck_disabled: true
 static_aliases:
     "my-alias": "facebook/opt-125m"
     "my-other-alias": "meta-llama/Llama-3.1-8B-Instruct"
@@ -164,7 +207,9 @@ Here is an example of a dynamic JSON config file:
     "static_backends": "http://localhost:9001,http://localhost:9002,http://localhost:9003",
     "static_models": "facebook/opt-125m,meta-llama/Llama-3.1-8B-Instruct,facebook/opt-125m",
     "static_model_types": "completion,chat,completion",
-    "static_aliases": "my-alias:meta-llama/Llama-3.1-8B-Instruct,my-other-alias:meta-llama/Llama-3.1-8B-Instruct"
+    "static_aliases": "my-alias:meta-llama/Llama-3.1-8B-Instruct,my-other-alias:meta-llama/Llama-3.1-8B-Instruct",
+    "static_backend_health_checks": true,
+    "static_healthcheck_disabled": "false,true,false"
 }
 ```
 

diff --git a/src/vllm_router/app.py b/src/vllm_router/app.py
@@ -216,6 +216,11 @@ def initialize_all(app: FastAPI, args):
                 if args.static_model_labels
                 else None
             ),
+            healthcheck_disabled=(
+                parse_comma_separated_args(args.static_healthcheck_disabled)
+                if args.static_healthcheck_disabled
+                else None
+            ),
             static_backend_health_checks=args.static_backend_health_checks,
             static_backend_health_check_interval=args.static_backend_health_check_interval,
             static_backend_health_check_timeout_seconds=args.static_backend_health_check_timeout_seconds,

diff --git a/src/vllm_router/dynamic_config.py b/src/vllm_router/dynamic_config.py
@@ -57,6 +57,7 @@ class DynamicRouterConfig:
     static_aliases: Optional[str] = None
     static_model_labels: Optional[str] = None
     static_model_types: Optional[str] = None
+    static_healthcheck_disabled: Optional[str] = None
     static_backend_health_checks: Optional[bool] = False
     static_backend_health_check_interval: Optional[int] = 60
     static_backend_health_check_timeout_seconds: Optional[int] = 10
@@ -97,6 +98,9 @@ def from_args(args) -> "DynamicRouterConfig":
             static_backend_health_checks=args.static_backend_health_checks,
             static_backend_health_check_interval=args.static_backend_health_check_interval,
             static_backend_health_check_timeout_seconds=args.static_backend_health_check_timeout_seconds,
+            static_healthcheck_disabled=getattr(
+                args, "static_healthcheck_disabled", None
+            ),
             k8s_port=args.k8s_port,
             k8s_namespace=args.k8s_namespace,
             k8s_label_selector=args.k8s_label_selector,
@@ -176,6 +180,11 @@ def reconfigure_service_discovery(self, config: DynamicRouterConfig):
                 ),
                 model_labels=parse_comma_separated_args(config.static_model_labels),
                 model_types=parse_comma_separated_args(config.static_model_types),
+                healthcheck_disabled=(
+                    parse_comma_separated_args(config.static_healthcheck_disabled)
+                    if config.static_healthcheck_disabled
+                    else None
+                ),
                 static_backend_health_checks=config.static_backend_health_checks,
                 static_backend_health_check_interval=config.static_backend_health_check_interval,
                 static_backend_health_check_timeout_seconds=config.static_backend_health_check_timeout_seconds,

diff --git a/src/vllm_router/parsers/parser.py b/src/vllm_router/parsers/parser.py
@@ -178,6 +178,13 @@ def parse_args():
         default=None,
         help="The model labels of static backends, separated by commas. E.g., model1,model2",
     )
+    parser.add_argument(
+        "--static-healthcheck-disabled",
+        type=str,
+        default=None,
+        help="Disable healthcheck per backend, separated by commas. E.g., true,false,true. "
+        "When set, backends with 'true' will be excluded from periodic health checks.",
+    )
     parser.add_argument(
         "--static-backend-health-checks",
         action="store_true",

diff --git a/src/vllm_router/parsers/yaml_utils.py b/src/vllm_router/parsers/yaml_utils.py
@@ -37,6 +37,15 @@ def generate_static_model_types(models: dict[str, Any]) -> str:
     return ",".join(static_model_types)
 
 
+def generate_static_healthcheck_disabled(models: dict[str, Any]) -> str:
+    """Join each model's lowercased healthcheck_disabled flag, once per backend."""
+    flags: list[str] = []
+    for entry in (m for m in models.values() if "static_backends" in m):
+        flag = str(entry.get("healthcheck_disabled", False)).lower()
+        flags += [flag] * len(entry["static_backends"])
+    return ",".join(flags)
+
+
 def read_and_process_yaml_config_file(config_path: str) -> dict[str, Any]:
     with open(config_path, encoding="utf-8") as f:
         try:
@@ -49,6 +58,9 @@ def read_and_process_yaml_config_file(config_path: str) -> dict[str, Any]:
                 yaml_config["static_backends"] = generate_static_backends(models)
                 yaml_config["static_models"] = generate_static_models(models)
                 yaml_config["static_model_types"] = generate_static_model_types(models)
+                yaml_config["static_healthcheck_disabled"] = (
+                    generate_static_healthcheck_disabled(models)
+                )
             if aliases:
                 yaml_config["static_aliases"] = generate_static_aliases(aliases)
             return yaml_config

diff --git a/src/vllm_router/service_discovery.py b/src/vllm_router/service_discovery.py
@@ -229,6 +229,7 @@ def __init__(
         aliases: List[str] | None = None,
         model_labels: List[str] | None = None,
         model_types: List[str] | None = None,
+        healthcheck_disabled: List[str] | None = None,
         static_backend_health_checks: bool = False,
         static_backend_health_check_interval: int = 60,
         static_backend_health_check_timeout_seconds: int = 10,
@@ -242,6 +243,7 @@ def __init__(
         self.aliases = aliases
         self.model_labels = model_labels
         self.model_types = model_types
+        self.healthcheck_disabled = healthcheck_disabled
         self.engines_id = [str(uuid.uuid4()) for i in range(0, len(urls))]
         self.added_timestamp = int(time.time())
         self.unhealthy_endpoint_hashes = []
@@ -256,9 +258,19 @@ def __init__(
     def get_unhealthy_endpoint_hashes(self) -> list[str]:
         unhealthy_endpoints = []
         try:
-            for url, model, model_type in zip(
-                self.urls, self.models, self.model_types, strict=True
+            for i, (url, model, model_type) in enumerate(
+                zip(self.urls, self.models, self.model_types or [], strict=True)
             ):
+                if (
+                    self.healthcheck_disabled
+                    and i < len(self.healthcheck_disabled)
+                    and self.healthcheck_disabled[i].lower() == "true"
+                ):
+                    logger.debug(
+                        f"Skipping health check for {model} at {url} "
+                        "(healthcheck disabled)"
+                    )
+                    continue
                 if utils.is_model_healthy(
                     url, model, model_type, self.health_check_timeout
                 ):