|
| 1 | +"""Pure cost-estimate formulas — VRAM, RAM, time, severity, model shape. |
| 2 | +
|
| 3 | +No I/O, no logging, no orchestration. Each formula docstring links to the |
| 4 | +reference it was calibrated against so a reviewer can follow each coefficient |
| 5 | +back to its source. |
| 6 | +
|
| 7 | +Conventions: |
| 8 | + * All ``*_gb`` results use the binary GiB convention (1024**3 bytes per GB) — |
| 9 | + matches the rest of the advisor's byte->GB conversions. |
| 10 | + * All ``*_hours`` results assume the GPU baseline of ~1 second per step; |
| 11 | + CPU runs pay a flat slowdown factor (see ``_time_for_transformer``). |
| 12 | + * "fp32 worst case" — we deliberately ignore lower-precision / FlashAttention / |
| 13 | + quantization optimizations, per the advisor's "pessimistic upper bound" contract. |
| 14 | +""" |
| 15 | + |
| 16 | +from __future__ import annotations |
| 17 | + |
| 18 | +from typing import TYPE_CHECKING |
| 19 | + |
| 20 | +from autointent._advisor._report import Severity |
| 21 | + |
| 22 | +if TYPE_CHECKING: |
| 23 | + from autointent._advisor._hub import ModelMeta |
| 24 | + from autointent._advisor._report import DatasetStats |
| 25 | + |
| 26 | + |
# Divisor for every byte -> GB conversion in this module (binary convention).
_BYTES_PER_GB = 1024**3
# Default ``seq_len`` argument of ``_vram_for_transformer``.
_DEFAULT_SEQ_LEN = 128

# Fallback architecture shape (BERT-base) used only when the model's actual
# config.json couldn't be fetched from HF Hub — see _hub._shape_from_config.
# Returned by ``_embedder_dim`` / ``_n_layers`` when the metadata has no value.
_DEFAULT_HIDDEN = 768
_DEFAULT_LAYERS = 12

# Estimate/budget ratio at which ``_classify_severity`` switches from AMPLE to
# TIGHT; also the fraction of the VRAM budget ``_max_fitting_batch_size`` targets.
_TIGHT_RATIO = 0.9
# NOTE(review): not referenced in the visible part of this module — presumably
# consumed by callers elsewhere; confirm before changing.
_MULTICLASS_THRESHOLD = 2
| 37 | + |
| 38 | + |
def _classify_severity(estimate: float, budget: float) -> Severity:
    """Bucket an ``estimate`` against its ``budget``.

    Checks are applied in this order:

    1. A non-positive ``estimate`` is AMPLE regardless of the budget.
    2. A non-positive ``budget`` (with a positive estimate) is TIGHT.
    3. Otherwise the ratio ``estimate / budget`` decides: OVER at ``>= 1``,
       TIGHT at ``>= _TIGHT_RATIO``, AMPLE below that.
    """
    if estimate <= 0:
        return Severity.AMPLE
    if budget <= 0:
        return Severity.TIGHT

    usage = estimate / budget
    return (
        Severity.OVER
        if usage >= 1
        else Severity.TIGHT
        if usage >= _TIGHT_RATIO
        else Severity.AMPLE
    )
| 56 | + |
| 57 | + |
def _weights_vram_for_transformer(meta: ModelMeta, mode: str) -> float:
    """Weight-side VRAM in GB (weights, gradients, optimizer state) — no activations.

    The figure is an intentionally pessimistic heuristic, in line with the
    advisor's "upper bound, not measurement" contract.

    ``mode`` selects the multiplier applied to ``meta.weights_gb``:
      * ``"inference"``: 1.3x — weights plus ~30% intermediate-tensor overhead.
      * ``"lora"``: the same 1.3x for the frozen base, plus a flat 0.5 GB for
        the adapters and their gradients / optimizer state.
      * anything else (full fine-tune): 4.5x — the textbook 4W (weights + grads
        + Adam m + Adam v) padded for loss-scale buffers, allocator
        fragmentation, cuDNN workspaces and gradient-accumulation buffers.
    """
    base_gb = meta.weights_gb
    if mode in ("inference", "lora"):
        forward_gb = base_gb * 1.3
        # LoRA pays the forward-only cost plus a fixed adapter allowance.
        return forward_gb + 0.5 if mode == "lora" else forward_gb
    return base_gb * 4.5
| 78 | + |
| 79 | + |
def _activations_gb_per_sample(
    meta: ModelMeta | None,
    seq_len: int,
    *,
    is_training: bool,
) -> float:
    """Heuristic per-sample activation memory in GB (fp32 worst case).

    Hidden size and layer count come from ``_embedder_dim`` / ``_n_layers``,
    which fall back to module defaults when ``meta`` lacks them.

    * Training: ``seq_len x hidden x layers x 16`` bytes — every layer's
      outputs are retained for the backward pass. The 16 is a 4-byte fp32
      activation times ~4x backward overhead (Korthikanti et al.).
    * Inference: ``seq_len x hidden x 8`` bytes — only one or two layers'
      outputs are in flight at a time, so there is no layer factor.
    """
    width = _embedder_dim(meta)
    if is_training:
        depth = _n_layers(meta)
        activation_bytes = seq_len * width * depth * 16
    else:
        activation_bytes = seq_len * width * 8
    return activation_bytes / _BYTES_PER_GB
| 99 | + |
| 100 | + |
def _vram_for_transformer(
    meta: ModelMeta,
    mode: str,
    *,
    batch_size: int = 0,
    seq_len: int = _DEFAULT_SEQ_LEN,
) -> float:
    """Total VRAM in GB: the weight-side cost plus per-sample activations x batch.

    A non-positive ``batch_size`` yields the weight-side figure alone. Any
    ``mode`` other than ``"inference"`` is costed with training activations
    (per-layer outputs kept for backward); inference uses the lighter
    one-or-two-layers-in-flight estimate.
    """
    weight_side_gb = _weights_vram_for_transformer(meta, mode)
    if batch_size <= 0:
        return weight_side_gb

    training = mode != "inference"
    activation_gb = _activations_gb_per_sample(meta, seq_len, is_training=training)
    return weight_side_gb + activation_gb * batch_size
| 118 | + |
| 119 | + |
def _max_fitting_batch_size(
    *,
    weight_vram_gb: float,
    vram_budget_gb: float,
    per_sample_gb: float,
) -> int:
    """Largest power-of-two batch whose activations fit the VRAM target.

    The target is ``vram_budget_gb * _TIGHT_RATIO`` (the AMPLE/TIGHT boundary);
    whatever remains after subtracting ``weight_vram_gb`` is divided by
    ``per_sample_gb`` and floored to a power of two.

    Returns 0 when ``per_sample_gb`` is not positive, or when the weights alone
    already reach the target.
    """
    if per_sample_gb <= 0:
        return 0

    headroom_gb = vram_budget_gb * _TIGHT_RATIO - weight_vram_gb
    if headroom_gb <= 0:
        return 0

    raw_batch = int(headroom_gb / per_sample_gb)
    return _floor_to_power_of_two(raw_batch)
| 138 | + |
| 139 | + |
# Multiplied into the estimate by ``_time_for_transformer`` when
# ``accelerator == "cpu"``.
# NOTE(review): the text below cites 100x+ slowdowns for some setups, so 50x is
# a single representative value rather than a strict upper bound.
_CPU_SLOWDOWN_FACTOR = 50.0
"""Rough multiplier for transformer training on CPU vs. a modern GPU.

Real benchmarks vary widely (30x for small BERTs on AVX-512 boxes, 100x+ for
billion-scale models on a stock laptop). A single 50x constant is a pessimistic
upper bound that's good enough to make the CPU/GPU distinction visible without
re-introducing the per-device tier table."""
| 147 | + |
| 148 | + |
def _time_for_transformer(
    *,
    n_trials: int,
    epochs: int,
    batch_size: int,
    n_samples: int,
    accelerator: str,
) -> float:
    """Transformer training time in hours.

    Baseline is "1 second per step" on a GPU (CUDA / MPS) — a step-count proxy,
    not a real wall-time calibration. CPU training pays a flat
    ``_CPU_SLOWDOWN_FACTOR`` so the report doesn't hide the fact that the same
    workload is dramatically slower without a GPU. Users should treat absolute
    numbers as ordering / ballpark information, not a budget.

    Step accounting:
      * ``batch_size`` below 1 is treated as 1.
      * Steps per epoch is ``ceil(n_samples / batch_size)`` — the trailing
        partial batch still costs a step, and rounding up keeps the estimate on
        the pessimistic side. At least one step per epoch is always charged.
      * Total steps = steps per epoch x ``epochs`` x ``n_trials``.

    Only ``accelerator == "cpu"`` triggers the slowdown; every other value is
    costed at the GPU baseline.
    """
    effective_batch = max(1, batch_size)
    # Integer ceiling division; floor division would drop the last partial batch.
    steps_per_epoch = max(1, -(-n_samples // effective_batch))
    steps = steps_per_epoch * epochs
    h = (n_trials * steps) / 3600.0
    if accelerator == "cpu":
        h *= _CPU_SLOWDOWN_FACTOR
    return h
| 170 | + |
| 171 | + |
def _n_layers(meta: ModelMeta | None) -> int:
    """Return ``meta.n_layers``, or ``_DEFAULT_LAYERS`` when ``meta`` or the field is ``None``."""
    configured = None if meta is None else meta.n_layers
    return _DEFAULT_LAYERS if configured is None else configured
| 177 | + |
| 178 | + |
def _embedder_dim(meta: ModelMeta | None) -> int:
    """Return ``meta.hidden_size``, or ``_DEFAULT_HIDDEN`` when ``meta`` or the field is ``None``."""
    configured = None if meta is None else meta.hidden_size
    return _DEFAULT_HIDDEN if configured is None else configured
| 184 | + |
| 185 | + |
def _largest_embedder(seen_models: dict[str, ModelMeta]) -> ModelMeta | None:
    """Pick the entry of ``seen_models`` with the highest ``total_params``.

    Returns ``None`` for an empty mapping. On ties the earliest entry in
    iteration order wins.
    """
    biggest = None
    for candidate in (seen_models or {}).values():
        # Strict ">" keeps the first of several equally large models.
        if biggest is None or candidate.total_params > biggest.total_params:
            biggest = candidate
    return biggest
| 191 | + |
| 192 | + |
def _ram_for_module(meta: ModelMeta, stats: DatasetStats) -> float:
    """Loose upper bound on RAM in GB: model weights plus the tokenized dataset.

    The tokenized text is costed at ``n_samples x avg_tokens x 4`` bytes, i.e.
    one int32 per BPE/WordPiece token id. Preprocessing artefacts (attention
    masks, position ids, ...) are deliberately left out; they are bounded by
    the same factor and the estimate is tight enough for the report.
    """
    weights_gb = meta.weights_gb
    token_id_bytes = stats.n_samples * stats.avg_tokens * 4
    return weights_gb + token_id_bytes / _BYTES_PER_GB
| 203 | + |
| 204 | + |
# Coefficients are dimensional (per-sample-per-feature-per-iteration seconds)
# rather than empirically tuned constants — they give relative-cost ordering
# across configurations and absolute ballpark wall-times.
# Per-unit cost used by ``_time_for_linear``.
_LINEAR_CPU_S_PER_SAMPLE_FEATURE_ITER = 1e-8
# Per-unit CPU cost used by ``_time_for_catboost``; divided by
# ``_CATBOOST_GPU_SPEEDUP`` when ``on_gpu`` is set.
_CATBOOST_CPU_S_PER_SAMPLE_FEATURE_ITER = 1e-9
_CATBOOST_GPU_SPEEDUP = 10.0
# LogisticRegressionCV defaults: Cs=10, cv=3 -> 10x3 inner fits + 1 final refit = 31.
# NOTE(review): scikit-learn >= 0.22 uses 5-fold CV when ``cv`` is left at its
# default; confirm cv=3 matches how the project configures the estimator.
# Not referenced in the visible part of this module — presumably passed by
# callers as ``cv_multiplier``; verify.
_LOGREG_CV_MULTIPLIER = 31
# Default value of `border_count` in CatBoost (number of histogram buckets per feature).
# NOTE(review): CatBoost documents a different default for GPU training — confirm
# whether the GPU path should use another value.
_CATBOOST_DEFAULT_BINS = 254
# Bytes per histogram bucket / tree node — order-of-magnitude constant.
# In ``_ram_for_catboost`` it is applied to tree storage only.
_CATBOOST_BYTES_PER_TREE_NODE = 32
| 217 | + |
| 218 | + |
def _ram_for_linear(*, stats: DatasetStats, embedder_dim: int) -> float:
    """RAM in GB for a linear model, dominated by the float64 design matrix.

    Three terms, all at 8 bytes per value:
      * design matrix: ``n_samples x embedder_dim``
      * coefficients: ``max(1, n_classes) x embedder_dim``
      * L-BFGS history: ten vectors of length ``embedder_dim``
    """
    bytes_per_float64 = 8.0
    design_matrix = bytes_per_float64 * stats.n_samples * embedder_dim
    coefficients = bytes_per_float64 * max(1, stats.n_classes) * embedder_dim
    solver_history = 10.0 * bytes_per_float64 * embedder_dim
    total_bytes = design_matrix + coefficients + solver_history
    return total_bytes / _BYTES_PER_GB
| 225 | + |
| 226 | + |
def _time_for_linear(
    *,
    n_trials: int,
    n_samples: int,
    embedder_dim: int,
    max_iter: int,
    cv_multiplier: int,
    class_multiplier: int,
) -> float:
    """LogisticRegression wall time, in hours.

    A single fit is costed as ``O(n_samples x n_features x max_iter x n_classes)``
    (sklearn's L-BFGS solver). That is scaled by ``cv_multiplier`` — the number
    of inner fits, 31 for the default LogisticRegressionCV — and by ``n_trials``,
    using ``_LINEAR_CPU_S_PER_SAMPLE_FEATURE_ITER`` seconds per unit of work.
    """
    seconds = n_trials * _LINEAR_CPU_S_PER_SAMPLE_FEATURE_ITER
    # Fold the remaining factors in left to right, one multiplication each.
    for factor in (n_samples, embedder_dim, max_iter, cv_multiplier, class_multiplier):
        seconds = seconds * factor
    return seconds / 3600.0
| 252 | + |
| 253 | + |
def _ram_for_catboost(*, stats: DatasetStats, n_features: int, iterations: int, depth: int) -> float:
    """CatBoost RAM in GB: quantized data matrix + histograms + tree storage.

    * data: 4 bytes x ``n_samples`` x ``n_features``
    * histograms: 4 bytes x ``n_features`` x ``_CATBOOST_DEFAULT_BINS``
    * trees: ``iterations`` x ``2**depth`` nodes x ``_CATBOOST_BYTES_PER_TREE_NODE``
    """
    matrix_bytes = 4.0 * stats.n_samples * n_features
    histogram_bytes = 4.0 * n_features * _CATBOOST_DEFAULT_BINS
    nodes_per_tree = 2**depth
    forest_bytes = iterations * nodes_per_tree * _CATBOOST_BYTES_PER_TREE_NODE
    total_bytes = matrix_bytes + histogram_bytes + forest_bytes
    return float(total_bytes / _BYTES_PER_GB)
| 260 | + |
| 261 | + |
def _time_for_catboost(
    *,
    n_trials: int,
    n_samples: int,
    n_features: int,
    iterations: int,
    depth: int,
    class_multiplier: int,
    on_gpu: bool,
) -> float:
    """CatBoost wall time, in hours.

    One fit is costed as ``O(iterations x n_samples x n_features x depth x
    n_classes)``, scaled by ``n_trials``. With ``on_gpu`` the per-unit CPU
    coefficient is divided by ``_CATBOOST_GPU_SPEEDUP`` (~10x, per CatBoost's
    published benchmarks).
    https://catboost.ai/en/docs/concepts/speed-up-training
    """
    unit_cost = (
        _CATBOOST_CPU_S_PER_SAMPLE_FEATURE_ITER / _CATBOOST_GPU_SPEEDUP
        if on_gpu
        else _CATBOOST_CPU_S_PER_SAMPLE_FEATURE_ITER
    )
    seconds = n_trials * iterations * unit_cost
    # Fold the remaining factors in left to right, one multiplication each.
    for factor in (n_samples, n_features, depth, class_multiplier):
        seconds = seconds * factor
    return seconds / 3600.0
| 284 | + |
| 285 | + |
def _floor_to_power_of_two(n: int) -> int:
    """Return the greatest power of two not exceeding ``n``, or 0 for ``n < 1``."""
    if n < 1:
        return 0
    # The highest set bit of a positive int is exactly that power of two.
    return 1 << (n.bit_length() - 1)
0 commit comments