-
Notifications
You must be signed in to change notification settings - Fork 87
Expand file tree
/
Copy pathrender_plans.py
More file actions
1401 lines (1185 loc) · 55.8 KB
/
render_plans.py
File metadata and controls
1401 lines (1185 loc) · 55.8 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
"""Render Jinja2 templates into per-stack YAML plans.
Loads templates, merges defaults with scenario overrides, resolves
versions and cluster resources, and writes validated YAML to the output dir.
"""
import base64
import hashlib
import json
import os
import re
from copy import deepcopy
from pathlib import Path
from typing import Optional, Any
import yaml
from jinja2 import Environment, TemplateSyntaxError, UndefinedError
from llmdbenchmark.config import config
from llmdbenchmark.logging.logger import get_logger
from llmdbenchmark.parser.config_schema import validate_config
from llmdbenchmark.parser.render_result import StackErrors, RenderResult
class RenderPlans:
"""Render and validate llmdbenchmark stack plans from Jinja2 templates.
Templates prefixed with ``_`` are treated as macros/partials and not
rendered directly. All others are rendered per stack with merged values.
"""
# Prefix for partial/macro files (not rendered directly)
PARTIAL_PREFIX = "_"
# Default namespace when "auto" is specified (matches original bash: llmdbench)
DEFAULT_NAMESPACE = "llmdbench"
def __init__(
self,
template_dir: Path,
defaults_file: Path,
scenarios_file: Path,
output_dir: Path,
logger=None,
version_resolver=None,
cluster_resource_resolver=None,
cli_namespace: str | None = None,
cli_model: str | None = None,
cli_methods: str | None = None,
cli_monitoring: bool | None = None,
cli_wva: bool = False,
cli_gateway_class: str | None = None,
setup_overrides: dict | None = None,
cli_stack_filter: list[str] | None = None,
):
self.template_dir = Path(template_dir)
self.defaults_file = Path(defaults_file)
self.scenarios_file = Path(scenarios_file)
self.output_dir = Path(output_dir)
self.version_resolver = version_resolver
self.cluster_resource_resolver = cluster_resource_resolver
self.cli_namespace = cli_namespace
self.cli_model = cli_model
self.cli_methods = cli_methods
self.cli_monitoring = cli_monitoring
self.cli_wva = cli_wva
# CLI override for `gateway.className`. Applied per-stack in
# `_resolve_gateway_class` ahead of `_validate_epponly_constraints`
# so the validator sees the post-override value. Only affects
# rendering on the modelservice path; ignored by kustomize/standalone/fma.
self.cli_gateway_class = cli_gateway_class
self.setup_overrides = setup_overrides
# When --stack selects exactly one stack, -m/--models scopes to
# that stack only (sibling stacks keep their scenario-defined
# models). When --stack isn't set or selects multiple stacks and
# the scenario is multi-stack, -m applies to every stack with a
# warning. See _resolve_model.
self.cli_stack_filter: list[str] = list(cli_stack_filter or [])
# Latched flag so the "-m applies to every stack" warning in
# _resolve_model fires once per RenderPlans instance, not N times
# in a multi-stack scenario.
self._cli_model_multi_stack_warned: bool = False
self.logger = logger or get_logger(
config.log_dir, verbose=config.verbose, log_name=__name__
)
# Cache for parsed templates (avoid re-parsing on multiple evals)
self._template_cache: Optional[list[dict]] = None
# Jinja2 environment (reusable)
self._jinja_env: Optional[Environment] = None
def _get_jinja_env(self) -> Environment:
    """Return the shared Jinja2 environment, building it on first use.

    The environment is created with autoescaping off and block
    trimming on, populated with this class's custom filters, and then
    memoised on ``self._jinja_env`` so later calls reuse it.
    """
    env = self._jinja_env
    if env is None:
        env = Environment(
            autoescape=False,
            trim_blocks=True,
            lstrip_blocks=True,
            keep_trailing_newline=False,
        )
        # Custom filters made available to every template.
        env.filters.update(
            {
                "indent": self._indent_filter,
                "toyaml": self._toyaml_filter,
                "tojson": self._tojson_filter,
                "is_empty": self._is_empty_filter,
                "default_if_empty": self._default_if_empty_filter,
                "b64pad": self._b64pad_filter,
                "b64encode": self._b64encode_filter,
                "model_id_label": self._model_id_label_filter,
            }
        )
        self._jinja_env = env
    return env
@staticmethod
def _indent_filter(text: str, width: int = 4, first: bool = False) -> str:
"""Indent text by specified width."""
if not text:
return text
lines = text.split("\n")
if first:
return "\n".join(" " * width + line if line else "" for line in lines)
if len(lines) == 1:
return text
return (
lines[0]
+ "\n"
+ "\n".join(" " * width + line if line else "" for line in lines[1:])
)
@staticmethod
def _toyaml_filter(
value: Any, indent: int = 0, default_flow_style: bool = False
) -> str:
"""Convert Python object to YAML string."""
if value is None:
return ""
if isinstance(value, str):
return value
if isinstance(value, (dict, list)) and len(value) == 0:
return ""
result = yaml.dump(
value, default_flow_style=default_flow_style, allow_unicode=True
).rstrip()
if indent > 0:
lines = result.split("\n")
return "\n".join(
" " * indent + line if line.strip() else line for line in lines
)
return result
@staticmethod
def _tojson_filter(value: Any) -> str:
"""Convert Python object to compact JSON string."""
if value is None:
return "null"
return json.dumps(value, separators=(",", ":"))
@staticmethod
def _is_empty_filter(value: Any) -> bool:
"""Check if value is empty (None, empty string, empty dict/list)."""
if value is None:
return True
if isinstance(value, str) and not value.strip():
return True
if isinstance(value, (dict, list)) and len(value) == 0:
return True
return False
@staticmethod
def _default_if_empty_filter(value: Any, default_value: Any) -> Any:
    """Substitute ``default_value`` when ``value`` is empty.

    Emptiness is decided by ``_is_empty_filter``; non-empty values are
    returned unchanged.
    """
    return default_value if RenderPlans._is_empty_filter(value) else value
@staticmethod
def _b64pad_filter(value: str) -> str:
"""Ensure a base64 string has proper padding.
Base64 strings must have length divisible by 4. If not,
append '=' characters to reach the next multiple of 4.
This fixes 'illegal base64 data' errors from Kubernetes.
"""
if not value or not isinstance(value, str):
return value
value = value.strip()
# Add padding to make length a multiple of 4
remainder = len(value) % 4
if remainder:
value += "=" * (4 - remainder)
return value
@staticmethod
def _b64encode_filter(value: str) -> str:
"""Base64-encode a plain-text string.
Useful for creating Kubernetes Secret data fields from plain text.
"""
if not value or not isinstance(value, str):
return value
return base64.b64encode(value.encode("utf-8")).decode("utf-8")
@staticmethod
def _model_id_label_filter(model_name: str, namespace: str = "") -> str:
"""Generate a hashed model ID label matching the bash implementation.
Takes a model name like 'Qwen/Qwen3-32B' and a namespace, produces
a DNS-safe label in the format: {first8}-{hash8}-{last8}.
This matches the bash model_attribute() function in setup/functions.py.
"""
if not model_name:
return model_name
model_id = model_name.replace("/", "-").replace(".", "-")
hash_input = f"{namespace}/{model_id}" if namespace else model_id
digest = hashlib.sha256(hash_input.encode("utf-8")).hexdigest()
label = f"{model_id[:8]}-{digest[:8]}-{model_id[-8:]}"
return label.lower()
def _load_yaml(self, yaml_file: Path) -> dict:
    """Load and parse a YAML file.

    Args:
        yaml_file: Path of the YAML document to read (UTF-8).

    Returns:
        The parsed document. An empty file (which PyYAML parses as
        ``None``) is returned as ``{}`` so callers can rely on the
        declared ``dict`` return type.

    Raises:
        FileNotFoundError: If ``yaml_file`` does not exist.

    Parse errors raised by PyYAML for invalid syntax propagate unchanged.
    """
    if not yaml_file.exists():
        raise FileNotFoundError(f"YAML file not found: {yaml_file}")
    with open(yaml_file, "r", encoding="utf-8") as f:
        # NOTE(review): full_load uses PyYAML's FullLoader. If these files
        # can ever come from an untrusted source, switch to yaml.safe_load.
        data = yaml.full_load(f)
    # An empty document parses to None; normalise it to an empty mapping.
    return {} if data is None else data
def deep_merge(self, base: dict, override: dict) -> dict:
    """Return a new dict holding ``base`` recursively updated by ``override``.

    Nested dicts present on both sides are merged key by key. Any
    other override value replaces the base value. Override entries
    whose value is ``None`` are skipped, so a YAML key with no value
    never clobbers a default. Neither input is modified.
    """
    merged = deepcopy(base)
    for name, incoming in override.items():
        if incoming is None:
            # Bare YAML key: keep whatever the base already provides.
            continue
        both_dicts = (
            name in merged
            and isinstance(merged[name], dict)
            and isinstance(incoming, dict)
        )
        merged[name] = (
            self.deep_merge(merged[name], incoming)
            if both_dicts
            else deepcopy(incoming)
        )
    return merged
def _apply_resource_preset(self, values: dict) -> dict:
    """Overlay the preset named by ``resourcePreset`` onto decode/prefill.

    The preset is looked up in ``resourcePresets``. When no preset is
    requested, or the requested one is missing (a warning is logged),
    ``values`` is returned untouched. Otherwise a deep copy is returned
    in which each of ``decode`` and ``prefill`` that the preset defines
    has been deep-merged with the preset's section.
    """
    chosen = values.get("resourcePreset")
    if not chosen:
        return values

    available = values.get("resourcePresets", {})
    if chosen not in available:
        self.logger.log_warning(
            f"Resource preset '{chosen}' not found, skipping..."
        )
        return values

    overrides = available[chosen]
    updated = deepcopy(values)
    for role in ("decode", "prefill"):
        if role not in overrides:
            continue
        current = updated.get(role, {})
        updated[role] = self.deep_merge(current, overrides[role])

    self.logger.log_info(f"Applied resource preset: {chosen}")
    return updated
def _resolve_namespace(self, values: dict) -> dict:
    """Resolve namespace config from the CLI override or the ``"auto"`` default.

    ``--namespace`` accepts a comma-separated ``deploy,harness,wva``
    triple. A blank harness or wva part falls back to the deploy
    namespace. A blank deploy part falls back to the scenario's
    ``namespace.name`` instead of producing an empty namespace. Any
    part equal to ``"auto"`` resolves to ``DEFAULT_NAMESPACE`` (deploy)
    or to the resolved deploy namespace (harness, wva).

    Without a CLI override, a ``namespace.name`` of ``"auto"`` (or an
    unset/empty name) resolves to ``DEFAULT_NAMESPACE``.

    In both cases ``gateway.namespace`` follows the deploy namespace
    when it is ``"auto"``, empty, or still the shipped default.

    Args:
        values: Merged configuration for one stack.

    Returns:
        A deep copy of ``values`` with the namespace fields resolved.
    """
    result = deepcopy(values)
    # `or {}` / `or "auto"` also cover YAML keys that are present but null.
    ns_config = result.get("namespace") or {}
    current_name = ns_config.get("name") or "auto"

    def sync_gateway(target_ns: str) -> None:
        # Only retarget the gateway while it still points at a placeholder.
        gw_config = result.get("gateway") or {}
        if gw_config.get("namespace") in ("auto", self.DEFAULT_NAMESPACE, ""):
            gw_config["namespace"] = target_ns
        result["gateway"] = gw_config

    if self.cli_namespace:
        parts = [p.strip() for p in self.cli_namespace.split(",")]
        # str.split never returns an empty list, so the fallback has to
        # test the first part itself (e.g. --namespace ",harness-ns").
        deploy_ns = parts[0] or current_name
        harness_ns = parts[1] if len(parts) > 1 and parts[1] else deploy_ns
        wva_ns = parts[2] if len(parts) > 2 and parts[2] else deploy_ns

        if deploy_ns == "auto":
            deploy_ns = self.DEFAULT_NAMESPACE
        if harness_ns == "auto":
            harness_ns = deploy_ns
        if wva_ns == "auto":
            wva_ns = deploy_ns

        ns_config["name"] = deploy_ns
        result["namespace"] = ns_config
        sync_gateway(deploy_ns)

        harness_config = result.get("harness") or {}
        harness_config["namespace"] = harness_ns
        result["harness"] = harness_config

        wva_config = result.get("wva") or {}
        wva_config["namespace"] = wva_ns
        result["wva"] = wva_config

        self.logger.log_info(
            f"Namespace from CLI: deploy={deploy_ns}, "
            f"harness={harness_ns}, wva={wva_ns}"
        )
    elif current_name == "auto":
        ns_config["name"] = self.DEFAULT_NAMESPACE
        result["namespace"] = ns_config
        sync_gateway(self.DEFAULT_NAMESPACE)
        self.logger.log_info(
            f'Namespace "auto" resolved to "{self.DEFAULT_NAMESPACE}"'
        )
    return result
@staticmethod
def _generate_short_name(model_id: str, namespace: str = "llmdbench") -> str:
"""Generate a K8s-safe short name from a HuggingFace model ID.
Follows the bash reference pattern::
{first_8_chars}-{sha256_first_8}-{last_8_chars}
Where *chars* come from the normalised model ID (``/`` to ``-``,
``.`` to ``-``). The hash is the SHA-256 of
``{namespace}/{normalised_model_id}``.
The result is lowercased so it is valid as a K8s resource name
(DNS subdomain: ``[a-z0-9-]``).
"""
normalised = model_id.replace("/", "-").replace(".", "-")
hash_input = f"{namespace}/{normalised}"
digest = hashlib.sha256(hash_input.encode("utf-8")).hexdigest()
first8 = normalised[:8]
last8 = normalised[-8:]
hash8 = digest[:8]
return f"{first8}-{hash8}-{last8}".lower()
def _resolve_model(
    self,
    values: dict,
    total_stacks: int = 1,
    stack_name: str = "",
) -> dict:
    """Apply the CLI ``-m/--models`` override to a stack's model config.

    When an override is active, a deep copy of ``values`` is returned
    with these ``model`` fields rewritten:

    - ``name`` and ``huggingfaceId`` -- the model ID from the CLI
    - ``path`` -- ``models/<model_id>``
    - ``shortName`` -- from ``_generate_short_name``, using the
      namespace already present in ``values`` (so
      ``_resolve_namespace`` must have run first)

    Scoping in multi-stack scenarios:

    1. One stack in total: the override always applies.
    2. Several stacks and a ``--stack`` filter naming exactly one:
       only that stack is rewritten; others are returned unchanged.
    3. Several stacks and no (or a broader) filter: every stack is
       rewritten, and a warning is logged once per instance because
       all stacks end up with the same model.

    Without an override, ``values`` is returned as-is.
    """
    model_id = self.cli_model
    if not model_id:
        return values

    filter_len = len(self.cli_stack_filter)
    multi_stack = total_stacks > 1
    if multi_stack and filter_len == 1:
        # Rule 2: leave stacks the operator did not select untouched.
        if stack_name != self.cli_stack_filter[0]:
            return values
        # The selected stack is rewritten silently (explicit scoping).
    elif multi_stack and not self._cli_model_multi_stack_warned:
        # Rule 3: warn once that the override fans out to every stack.
        self.logger.log_warning(
            f"-m/--models={self.cli_model!r} is applied identically "
            f"to all {total_stacks} stack(s). In a multi-model scenario "
            "this replaces every stack's model with the same value, "
            "which collapses the scenario into N copies of one model. "
            "To scope -m to a single stack, combine with --stack <name>; "
            "to benchmark a pre-existing pool, drop -m entirely and "
            "let --stack <name> auto-resolve the endpoint."
        )
        self._cli_model_multi_stack_warned = True

    result = deepcopy(values)
    model_config = result.get("model", {})
    namespace = result.get("namespace", {}).get("name", self.DEFAULT_NAMESPACE)
    short_name = self._generate_short_name(model_id, namespace)
    model_config.update(
        {
            "name": model_id,
            "huggingfaceId": model_id,
            "path": f"models/{model_id}",
            "shortName": short_name,
        }
    )
    result["model"] = model_config

    suffix = f" [stack={stack_name}]" if stack_name else ""
    self.logger.log_info(
        f"Model from CLI: {model_id} "
        f"(shortName={model_config['shortName']}){suffix}"
    )
    return result
def _warn_custom_command_conflicts(self, values: dict) -> None:
    """Warn about customCommands that the ``--models`` override can't reach.

    ``customCommand`` is a verbatim string, so a CLI model override
    changes the config dict but not anything hardcoded inside the
    command. For each of ``decode`` and ``prefill`` that defines
    ``vllm.customCommand``, a warning tells the user to update or
    remove it. Nothing happens when no CLI model is set.
    """
    if not self.cli_model:
        return

    for role in ("decode", "prefill"):
        custom = values.get(role, {}).get("vllm", {}).get("customCommand")
        if not custom:
            continue
        self.logger.log_warning(
            f"CLI --models override ({self.cli_model}) will not "
            f"propagate into {role}.vllm.customCommand. "
            f"Update the customCommand in your scenario to match, "
            f"or remove customCommand to use the auto-generated command."
        )
def _resolve_monitoring(self, values: dict) -> dict:
    """Apply ``--monitoring`` / ``--no-monitoring`` to the config.

    With ``--monitoring``:
    - ``monitoring.podmonitor.enabled`` is set to True
    - ``monitoring.metricsScrapeEnabled`` is set to True

    With ``--no-monitoring``:
    - ``monitoring.podmonitor.enabled`` is set to False
    - ``inferenceExtension.monitoring.prometheus.enabled`` is set to False

    When neither flag was given (``cli_monitoring`` is ``None``),
    ``values`` is returned unchanged. Otherwise a deep copy carrying
    the changes is returned.
    """
    requested = self.cli_monitoring
    if requested is None:
        return values

    result = deepcopy(values)
    monitoring_config = result.setdefault("monitoring", {})
    podmonitor_config = monitoring_config.setdefault("podmonitor", {})

    if not requested:
        podmonitor_config["enabled"] = False
        prometheus = (
            result.setdefault("inferenceExtension", {})
            .setdefault("monitoring", {})
            .setdefault("prometheus", {})
        )
        prometheus["enabled"] = False
        self.logger.log_info(
            "Monitoring disabled from CLI (--no-monitoring): "
            "PodMonitor and GAIE ServiceMonitor will not be created"
        )
        return result

    podmonitor_config["enabled"] = True
    monitoring_config["metricsScrapeEnabled"] = True
    self.logger.log_info(
        "Monitoring enabled from CLI: PodMonitor + metrics scraping"
    )
    return result
def _resolve_wva(self, values: dict) -> dict:
    """Turn on the Workload Variant Autoscaler when ``-u/--wva`` was given.

    Returns ``values`` unchanged when the flag is off; otherwise
    returns a deep copy with ``wva.enabled`` set to True.
    """
    if self.cli_wva:
        result = deepcopy(values)
        result.setdefault("wva", {})["enabled"] = True
        self.logger.log_info("Workload Variant Autoscaler enabled from CLI")
        return result
    return values
def _resolve_deploy_method(self, values: dict) -> dict:
    """Select the active deploy method from the CLI ``--methods`` flag.

    Recognised methods are ``standalone``, ``modelservice``, ``fma``
    and ``kustomize``. Only one can be active, so conflicting
    combinations are reduced to a single winner with a warning. The
    winner's ``enabled`` flag is then set to True and the other
    three to False.

    Without ``--methods``, ``values`` is returned unchanged. With it,
    a deep copy is returned in which all four method sections exist.
    If none of the requested names is recognised, no ``enabled`` flag
    is touched.
    """
    if not self.cli_methods:
        return values

    result = deepcopy(values)
    requested = [m.strip() for m in self.cli_methods.split(",")]

    # Pairwise conflicts, checked in order against the current list:
    # (first, second, winner, warning text).
    pairwise_rules = (
        (
            "standalone",
            "modelservice",
            "modelservice",
            "Cannot enable both standalone and modelservice -- "
            "choose one. Using modelservice.",
        ),
        (
            "standalone",
            "fma",
            "standalone",
            "Cannot enable both standalone and fma -- choose one. Using standalone.",
        ),
        (
            "modelservice",
            "fma",
            "modelservice",
            "Cannot enable both modelservice and fma -- "
            "choose one. Using modelservice.",
        ),
    )
    for first, second, winner, warning in pairwise_rules:
        if first in requested and second in requested:
            self.logger.log_warning(warning)
            requested = [winner]

    if "kustomize" in requested and any(
        m in requested for m in ("standalone", "modelservice", "fma")
    ):
        self.logger.log_warning(
            "Cannot combine kustomize with another deploy method -- "
            "choose one. Using kustomize."
        )
        requested = ["kustomize"]

    # Section creation order and selection priority are the same.
    priority = ("standalone", "modelservice", "fma", "kustomize")
    sections = {name: result.setdefault(name, {}) for name in priority}

    chosen = next((name for name in priority if name in requested), None)
    if chosen is not None:
        for name, section in sections.items():
            section["enabled"] = name == chosen
        self.logger.log_info(f"Deploy method from CLI: {chosen}")
    return result
# Whitelist for the --gateway-class CLI override. `_resolve_gateway_class`
# rejects anything else (ValueError) at render time when modelservice is
# the active deploy method, so a typo doesn't silently produce a broken
# Gateway / InferencePool chart configuration. The list is not enforced
# for the other deploy methods.
_SUPPORTED_GATEWAY_CLASSES: tuple[str, ...] = (
"epponly",
"istio",
"agentgateway",
"gke",
"data-science-gateway-class",
)
def _resolve_gateway_class(self, values: dict) -> dict:
    """Write the ``--gateway-class`` CLI value into ``gateway.className``.

    The value (stripped of surrounding whitespace) is validated only
    when modelservice is the active deploy method, i.e. when
    ``modelservice.enabled`` is truthy or absent. In that case it must
    be one of ``_SUPPORTED_GATEWAY_CLASSES`` so a typo fails at plan
    time. For any other deploy method the value is stored verbatim
    without validation, because the gateway block is ignored there and
    CI scripts may pass sentinels such as ``none`` uniformly.

    Returns ``values`` unchanged when no override was given, otherwise
    a deep copy with the new class name.

    Raises:
        ValueError: If modelservice is active and the value is not in
            the whitelist.
    """
    requested = self.cli_gateway_class
    if not requested:
        return values

    candidate = requested.strip()
    on_modelservice = (values.get("modelservice") or {}).get("enabled", True)

    if on_modelservice and candidate not in self._SUPPORTED_GATEWAY_CLASSES:
        supported = ", ".join(self._SUPPORTED_GATEWAY_CLASSES)
        raise ValueError(
            f"--gateway-class={candidate!r} is not a supported value "
            f"for the modelservice deploy method. "
            f"Choose one of: {supported}."
        )

    result = deepcopy(values)
    gateway_config = result.setdefault("gateway", {})

    if not on_modelservice:
        # No rendered template reads the gateway block here; keep the
        # value so the stored config reflects what the CLI requested.
        gateway_config["className"] = candidate
        self.logger.log_info(
            f"Gateway class from CLI: {candidate} "
            f"(ignored -- modelservice is not the active deploy method)"
        )
        return result

    previous = gateway_config.get("className")
    gateway_config["className"] = candidate
    replaced = previous and previous != candidate
    self.logger.log_info(
        f"Gateway class override from CLI: {previous} -> {candidate}"
        if replaced
        else f"Gateway class from CLI: {candidate}"
    )
    return result
@staticmethod
def _validate_epponly_constraints(
values: dict,
total_stacks: int,
stack_name: str,
) -> list[str]:
"""Reject incompatible options when ``gateway.className == 'epponly'``.
``epponly`` is the llm-d "standalone" router topology: no Kubernetes
Gateway is deployed and the EPP runs with an Envoy sidecar that
serves HTTP directly. The setting only takes effect on the
``modelservice`` deploy path (it controls how the GAIE Helm chart is
wired). Some scenario features become meaningless or actively broken
when ``epponly`` is paired with modelservice:
- multi-stack scenarios (no shared Gateway / HTTPRoute -- each
stack would need its own EPP, but standup currently can't
advertise N independent EPP endpoints cleanly).
- shared HTTPRoute mode (HTTPRoute references a Gateway that
does not exist).
When a non-modelservice deploy method is active (``kustomize``,
``standalone``, ``fma``) the ``gateway.*`` block is ignored
entirely by the rendering pipeline, so we no-op rather than
flagging an error: this lets a single scenario file ship
``gateway.className: epponly`` as the modelservice default while
still being usable verbatim with ``-t kustomize`` (or any other
deploy method override).
Returns a list of fatal error strings. An empty list means the
configuration is compatible.
"""
gw_class = (values.get("gateway") or {}).get("className", "")
if gw_class != "epponly":
return []
modelservice_enabled = (values.get("modelservice") or {}).get("enabled", True)
if not modelservice_enabled:
# Another deploy method owns the stack -- gateway.className is
# a no-op for kustomize/standalone/fma, so silently accept.
return []
errors: list[str] = []
if total_stacks > 1:
errors.append(
f"[{stack_name}] gateway.className=epponly is single-stack "
"only (the standalone router topology has no shared "
"Gateway / HTTPRoute to multiplex multiple models). "
f"This scenario has {total_stacks} stacks."
)
http_route_mode = (values.get("httpRoute") or {}).get("mode")
if http_route_mode == "shared":
errors.append(
f"[{stack_name}] gateway.className=epponly cannot be used "
"with httpRoute.mode=shared (shared HTTPRoute requires a "
"Gateway that epponly does not deploy)."
)
return errors
def _log_image_overrides(self, values: dict) -> None:
    """Log every image whose tag is pinned rather than ``auto``.

    Each dict entry under ``images``, plus ``standalone.image``, is
    reported when it has a repository and a non-empty tag other than
    ``"auto"``. Per the original note, this runs before version
    resolution so users can see which images were pinned by the
    scenario or CLI.
    """

    def pinned(spec: Any):
        # Returns (repository, tag) for a pinned image spec, else None.
        if not isinstance(spec, dict):
            return None
        tag = spec.get("tag", "auto")
        repo = spec.get("repository", "")
        if tag and tag != "auto" and repo:
            return repo, tag
        return None

    for key, spec in values.get("images", {}).items():
        hit = pinned(spec)
        if hit:
            repo, tag = hit
            self.logger.log_info(
                f"Image override: {key} pinned to {repo}:{tag}"
            )

    hit = pinned(values.get("standalone", {}).get("image", {}))
    if hit:
        repo, tag = hit
        self.logger.log_info(
            f"Image override: standalone pinned to {repo}:{tag}"
        )
# Sentinel values indicating no real HF token has been configured
def _resolve_model_id_label(self, values: dict) -> dict:
    """Compute the hashed model ID label and store it in ``values``.

    When ``model.name`` is set, the label is
    ``{first8}-{sha256_8}-{last8}`` of the name with ``/`` and ``.``
    replaced by ``-``, lowercased. The namespace is included in the
    hash input when one is present. When there is no model name, the
    label falls back to ``model.shortName`` (or an empty string).

    ``values`` is modified in place: the label is written to
    ``values["model_id_label"]`` and to ``idLabel`` on the model dict.
    The same ``values`` object is returned.
    """
    model = values.get("model", {})
    model_name = model.get("name", "")
    namespace = values.get("namespace", {}).get("name", "")

    if not model_name:
        label = model.get("shortName", "")
    else:
        normalised = model_name.replace("/", "-").replace(".", "-")
        seed = f"{namespace}/{normalised}" if namespace else normalised
        hash8 = hashlib.sha256(seed.encode("utf-8")).hexdigest()[:8]
        label = f"{normalised[:8]}-{hash8}-{normalised[-8:]}".lower()

    values["model_id_label"] = label
    model["idLabel"] = label
    return values
# Defaults whose "bare" form collides across multi-stack scenarios -
# rewrite to {default}-{model_id_label} so each stack gets a uniquely
# named resource. The rewrite only fires when the config is still at
# the shipped default, so an explicit override (in ``defaults.yaml``,
# the scenario's ``shared:`` block, or a per-stack block) is
# preserved as-is.
#
# Each entry is ``(config path, default value)``: the path is a tuple of
# nested keys, and ``_resolve_per_stack_identity`` rewrites the value only
# when it is exactly equal to the default listed here.
#
# Intentionally NOT included:
# - storage.modelPvc.name - model weights share one PVC keyed by
# the per-stack `model.path`, not by the PVC name. NVMe-backed
# RWX PVCs in particular want one volume with per-model subdirs,
# not N independent volumes. The download Job name is still made
# per-stack (below) so parallel downloads don't race.
_STACK_SCOPED_DEFAULTS: tuple[tuple[tuple[str, ...], str], ...] = (
# config path, default value that triggers the rewrite
(("downloadJob", "name"), "download-model"),
# EPP metrics-reader Secret - the gaie chart uses this to give its
# SA access to the user-workload-monitoring Prometheus. Two gaie
# Helm releases sharing this Secret name in one namespace fail
# with "owned by another helm release".
(
("inferenceExtension", "monitoring", "secretName"),
"inference-gateway-sa-metrics-reader-secret",
),
)
def _resolve_per_stack_identity(self, values: dict, total_stacks: int = 1) -> dict:
    """Suffix stack-scoped resource names with ``model_id_label``.

    For every ``(path, default)`` pair in ``_STACK_SCOPED_DEFAULTS``,
    a config value that still equals the shipped default is rewritten
    to ``{default}-{model_id_label}``. This keeps resources from
    different stacks of a multi-stack scenario from colliding, without
    scenario authors having to override each name. Explicitly
    overridden values are left alone.

    Nothing is changed for single-stack scenarios (so their resource
    names stay stable) or when no ``model_id_label`` is available.

    ``values`` is updated in place and returned.
    """
    if total_stacks < 2:
        return values

    suffix = values.get("model_id_label") or ""
    if suffix:
        for path, default in self._STACK_SCOPED_DEFAULTS:
            if self._get_nested(values, path) == default:
                self._set_nested(values, path, f"{default}-{suffix}")
    return values
@staticmethod
def _get_nested(root: dict, path: tuple[str, ...]) -> Any:
"""Walk ``root`` along ``path``; return the leaf value or ``None``."""
cur: Any = root
for part in path:
if not isinstance(cur, dict) or part not in cur:
return None
cur = cur[part]
return cur
@staticmethod
def _set_nested(root: dict, path: tuple[str, ...], value: Any) -> None:
"""Walk ``root`` along ``path``, creating dicts as needed, then set."""
cur = root
for part in path[:-1]:
if part not in cur or not isinstance(cur[part], dict):
cur[part] = {}
cur = cur[part]
cur[path[-1]] = value
def _resolve_inference_pool_host(self, values: dict) -> dict:
    """Fill in ``destinationRule.host`` from ``model_id_label`` if unset.

    Applies to
    ``inferenceExtension.inferencePoolProviderConfig.destinationRule``.
    When that block exists without a ``host`` and a ``model_id_label``
    is available, the host becomes ``{model_id_label}-gaie-epp``. The
    original note says this is always the GAIE EPP service name, so
    scenario authors need not compute the hashed label by hand.

    ``values`` is updated in place and returned.
    """
    dest_rule = (
        values.get("inferenceExtension", {})
        .get("inferencePoolProviderConfig", {})
        .get("destinationRule")
    )
    if dest_rule is None or dest_rule.get("host"):
        return values

    label = values.get("model_id_label", "")
    if label:
        host = f"{label}-gaie-epp"
        dest_rule["host"] = host
        self.logger.log_info(
            f"Auto-resolved destinationRule.host to '{host}'"
        )
    return values
# Matches ${dotted.path} but NOT ${SHELL_VAR} (no dots): the name must be
# two or more word segments joined by dots. Group 1 captures the dotted
# path without the surrounding ``${`` / ``}``.
_CONFIG_VAR_RE = re.compile(r"\$\{([\w]+(?:\.[\w]+)+)\}")
def _substitute_config_variables(self, values: dict) -> dict:
"""Replace ``${dotted.path}`` references in string values with resolved config values.
Walks the config dict recursively. For every string value, substitutes
``${model.name}``-style references with the corresponding value from
the config. Shell variables like ``$VLLM_PORT`` or ``${SINGLE_WORD}``
are left untouched because the regex requires at least one dot.
"""
result = deepcopy(values)
self._substitute_recursive(result, result)
return result
def _substitute_recursive(self, node: Any, root: dict) -> None:
"""Recursively substitute config variable references in place."""
if isinstance(node, dict):
for key, value in node.items():
if isinstance(value, str):
node[key] = self._substitute_string(value, root)
elif isinstance(value, (dict, list)):
self._substitute_recursive(value, root)
elif isinstance(node, list):
for i, item in enumerate(node):
if isinstance(item, str):
node[i] = self._substitute_string(item, root)
elif isinstance(item, (dict, list)):
self._substitute_recursive(item, root)
def _substitute_string(self, text: str, root: dict) -> str:
"""Replace all ``${dotted.path}`` patterns in a single string."""
def _replace(match: re.Match) -> str:
path = match.group(1)
value = self._resolve_dotted_path(path, root)
if value is None:
self.logger.log_warning(
f"⚠️ Config variable '${{{path}}}' could not be resolved, "
"leaving as-is"
)
return match.group(0)
return str(value)
return self._CONFIG_VAR_RE.sub(_replace, text)
@staticmethod
def _resolve_dotted_path(path: str, root: dict) -> Optional[str]:
"""Resolve a dotted path like ``model.name`` against the config dict."""
current = root
for part in path.split("."):
if isinstance(current, dict) and part in current:
current = current[part]
else:
return None
if isinstance(current, (dict, list)):
return None
return current
# Placeholder ``huggingface.token`` values that ``_resolve_hf_token`` treats
# as "no token configured" before falling back to environment variables.
_HF_TOKEN_SENTINELS = {"REPLACE_TOKEN", "REPLACE_TOKEN_B64", ""}
def _resolve_hf_token(self, values: dict) -> dict:
"""Auto-detect HuggingFace token and set huggingface.enabled.
When the configured ``huggingface.token`` is still a sentinel
value (``REPLACE_TOKEN`` or empty), this method checks the
following environment variables in order:
1. ``HF_TOKEN``
2. ``HUGGING_FACE_HUB_TOKEN``
If a token is found, it is injected into the values dict along
with its base64-encoded form so that rendered K8s Secret YAMLs
work correctly.
Sets ``huggingface.enabled`` to control whether HF token secrets
and auth are rendered. Public models work without a token --
the secret and auth blocks are skipped entirely. Gated models
without a token cause an immediate error.
"""
result = deepcopy(values)
hf_config = result.get("huggingface", {})
current_token = hf_config.get("token", "")
# Only auto-detect if the current token is a sentinel / empty
if current_token and current_token not in self._HF_TOKEN_SENTINELS:
hf_config["enabled"] = True
result["huggingface"] = hf_config
return result
# Check environment variables (order matches HuggingFace SDK convention)
env_token = os.environ.get("HF_TOKEN") or os.environ.get(
"HUGGING_FACE_HUB_TOKEN"
)
if not env_token:
# No token available -- disable HF secret/auth rendering.
# Public models will work fine; gated models are caught at
# standup time by the model access check.
hf_config["enabled"] = False
hf_config["token"] = ""
hf_config["tokenBase64"] = ""
result["huggingface"] = hf_config
self.logger.log_info(
"No HuggingFace token found -- HF secret will not be created. "
"Public models will work; gated models will fail at standup.",
emoji="ℹ️",
)
return result
# Inject the token and its base64-encoded form
hf_config["token"] = env_token
hf_config["tokenBase64"] = base64.b64encode(env_token.encode("utf-8")).decode(
"utf-8"
)
hf_config["enabled"] = True
result["huggingface"] = hf_config
self.logger.log_info(
"HuggingFace token detected from environment "
f"(hf_{'*' * 4}...{env_token[-4:]})",
emoji="🔑",
)
return result
def _load_templates(self) -> list[dict]:
"""Load .j2 files from the template dir, prepending shared macros."""
if self._template_cache is not None:
return self._template_cache
if not self.template_dir.exists():
raise FileNotFoundError(
f"Template directory not found: {self.template_dir}"
)
if not self.template_dir.is_dir():
raise NotADirectoryError(
f"Template path is not a directory: {self.template_dir}"
)