Skip to content

Commit 1841d08

Browse files
committed
commit missing files
1 parent 7cb0f53 commit 1841d08

5 files changed

Lines changed: 944 additions & 0 deletions

File tree

src/autointent/_advisor/_estimates/__init__.py

Whitespace-only changes.
Lines changed: 293 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,293 @@
1+
"""Pure cost-estimate formulas — VRAM, RAM, time, severity, model shape.
2+
3+
No I/O, no logging, no orchestration. Each formula docstring links to the
4+
reference it was calibrated against so a reviewer can follow each coefficient
5+
back to its source.
6+
7+
Conventions:
8+
* All ``*_gb`` results use the binary GiB convention (1024**3 bytes per GB) —
9+
matches the rest of the advisor's byte->GB conversions.
10+
* All ``*_hours`` results assume the GPU baseline of ~1 second per step;
11+
CPU runs pay a flat slowdown factor (see ``_time_for_transformer``).
12+
* "fp32 worst case" — we deliberately ignore lower-precision / FlashAttention /
13+
quantization optimizations, per the advisor's "pessimistic upper bound" contract.
14+
"""
15+
16+
from __future__ import annotations
17+
18+
from typing import TYPE_CHECKING
19+
20+
from autointent._advisor._report import Severity
21+
22+
if TYPE_CHECKING:
23+
from autointent._advisor._hub import ModelMeta
24+
from autointent._advisor._report import DatasetStats
25+
26+
27+
# Divisor for every bytes -> ``*_gb`` conversion in this module: binary GiB (1024**3),
# not decimal 10**9.
_BYTES_PER_GB = 1024**3
# Default ``seq_len`` (in tokens) of ``_vram_for_transformer``.
_DEFAULT_SEQ_LEN = 128

# Fallback architecture shape (BERT-base) used only when the model's actual
# config.json couldn't be fetched from HF Hub — see _hub._shape_from_config.
_DEFAULT_HIDDEN = 768
_DEFAULT_LAYERS = 12

# Estimate/budget ratio at which ``_classify_severity`` moves from AMPLE to TIGHT;
# ``_max_fitting_batch_size`` targets the same fraction of the VRAM budget.
_TIGHT_RATIO = 0.9
# NOTE(review): not referenced by any function visible in this module — presumably the
# class count above which a task is treated as multiclass; confirm against callers.
_MULTICLASS_THRESHOLD = 2
38+
39+
def _classify_severity(estimate: float, budget: float) -> Severity:
    """Bucket an ``(estimate, budget)`` pair into a ``Severity``.

    Rules are checked in this order:

    1. ``estimate <= 0``  -> AMPLE (nothing to spend, whatever the budget is).
    2. ``budget <= 0``    -> TIGHT (a positive estimate against no usable budget).
    3. ``estimate / budget >= 1``            -> OVER.
    4. ``estimate / budget >= _TIGHT_RATIO`` -> TIGHT.
    5. anything else                          -> AMPLE.
    """
    if estimate <= 0:
        return Severity.AMPLE
    if budget <= 0:
        return Severity.TIGHT

    utilisation = estimate / budget
    if utilisation >= 1:
        return Severity.OVER
    # ``>=`` (rather than a negated ``<``) keeps a NaN ratio in the AMPLE bucket.
    return Severity.TIGHT if utilisation >= _TIGHT_RATIO else Severity.AMPLE
56+
57+
58+
def _weights_vram_for_transformer(meta: ModelMeta, mode: str) -> float:
59+
"""Weight-side VRAM in GB — weights + grads + optimizer state. Excludes activations.
60+
61+
Returns a deliberately pessimistic upper bound, matching the advisor's
62+
"heuristic upper bound, not measurement" contract.
63+
64+
Modes:
65+
* ``inference``: forward only — weights + ~30% intermediate-tensor overhead.
66+
* ``lora``: frozen base + small trainable adapters + their grads/optimizer (~0.5 GB).
67+
* ``full-finetune`` (default): the textbook 4W (weights + grads + Adam m + Adam v).
68+
We use 4.5W to leave headroom for loss-scale buffers, allocator fragmentation,
69+
cuDNN workspaces, and gradient-accumulation buffers — none of which the textbook
70+
4W accounting captures.
71+
"""
72+
weights_gb = meta.weights_gb
73+
if mode == "inference":
74+
return weights_gb * 1.3
75+
if mode == "lora":
76+
return weights_gb * 1.3 + 0.5
77+
return weights_gb * 4.5
78+
79+
80+
def _activations_gb_per_sample(
    meta: ModelMeta | None,
    seq_len: int,
    *,
    is_training: bool,
) -> float:
    """Estimate activation memory for one sample, in GB, under an fp32 worst case.

    * Training: ``seq_len * hidden * n_layers * 16`` bytes — every layer's output is
      retained for the backward pass. 16 bytes/token/layer ~ fp32 activation (4B) x
      ~4x backward overhead (Korthikanti et al.).
    * Inference: ``seq_len * hidden * 8`` bytes — only ~1-2 layers' outputs are in
      flight at once, so the layer count does not enter.

    ``hidden`` and ``n_layers`` come from ``_embedder_dim`` / ``_n_layers``, which fall
    back to defaults when ``meta`` is missing.
    """
    width = _embedder_dim(meta)
    if is_training:
        total_bytes = seq_len * width * _n_layers(meta) * 16
    else:
        total_bytes = seq_len * width * 8
    return total_bytes / _BYTES_PER_GB
99+
100+
101+
def _vram_for_transformer(
    meta: ModelMeta,
    mode: str,
    *,
    batch_size: int = 0,
    seq_len: int = _DEFAULT_SEQ_LEN,
) -> float:
    """Return total VRAM in GB: weight-side footprint plus activations for one batch.

    With ``batch_size <= 0`` (the default) only the weight-side figure from
    ``_weights_vram_for_transformer`` is returned. Otherwise per-sample activation
    memory is added ``batch_size`` times; every mode other than ``"inference"`` is
    costed as training (per-layer outputs kept for backward).
    """
    weight_side_gb = _weights_vram_for_transformer(meta, mode)
    if batch_size <= 0:
        return weight_side_gb

    training = mode != "inference"
    activation_gb = _activations_gb_per_sample(meta, seq_len, is_training=training)
    return weight_side_gb + activation_gb * batch_size
118+
119+
120+
def _max_fitting_batch_size(
    *,
    weight_vram_gb: float,
    vram_budget_gb: float,
    per_sample_gb: float,
) -> int:
    """Return the largest power-of-two batch whose VRAM stays within ``_TIGHT_RATIO`` of the budget.

    Returns 0 when ``per_sample_gb`` is not positive, or when the weights alone
    already reach ``vram_budget_gb * _TIGHT_RATIO``. Otherwise the remaining headroom
    is divided by ``per_sample_gb``, truncated, and rounded down to a power of two.
    """
    if per_sample_gb <= 0:
        return 0

    # Memory left for activations once the weights sit under the TIGHT threshold.
    headroom_gb = vram_budget_gb * _TIGHT_RATIO - weight_vram_gb
    if headroom_gb <= 0:
        return 0

    return _floor_to_power_of_two(int(headroom_gb / per_sample_gb))
138+
139+
140+
# Flat multiplier ``_time_for_transformer`` applies to its estimate when ``accelerator == "cpu"``.
_CPU_SLOWDOWN_FACTOR = 50.0
"""Rough multiplier for transformer training on CPU vs. a modern GPU.

Real benchmarks vary widely (30x for small BERTs on AVX-512 boxes, 100x+ for
billion-scale models on a stock laptop). A single 50x constant is a pessimistic
upper bound that's good enough to make the CPU/GPU distinction visible without
re-introducing the per-device tier table."""
147+
148+
149+
def _time_for_transformer(
150+
*,
151+
n_trials: int,
152+
epochs: int,
153+
batch_size: int,
154+
n_samples: int,
155+
accelerator: str,
156+
) -> float:
157+
"""Transformer training time in hours.
158+
159+
Baseline is "1 second per step" on a GPU (CUDA / MPS) — a step-count proxy,
160+
not a real wall-time calibration. CPU training pays a flat ``_CPU_SLOWDOWN_FACTOR``
161+
so the report doesn't hide the fact that the same workload is dramatically
162+
slower without a GPU. Users should treat absolute numbers as ordering /
163+
ballpark information, not a budget.
164+
"""
165+
steps = max(1, (n_samples // max(1, batch_size))) * epochs
166+
h = (n_trials * steps) / 3600.0
167+
if accelerator == "cpu":
168+
h *= _CPU_SLOWDOWN_FACTOR
169+
return h
170+
171+
172+
def _n_layers(meta: ModelMeta | None) -> int:
173+
"""Layer count from the model's ``config.json``; falls back to BERT-base when absent."""
174+
if meta is not None and meta.n_layers is not None:
175+
return meta.n_layers
176+
return _DEFAULT_LAYERS
177+
178+
179+
def _embedder_dim(meta: ModelMeta | None) -> int:
180+
"""Hidden size from the model's ``config.json``; falls back to BERT-base when absent."""
181+
if meta is not None and meta.hidden_size is not None:
182+
return meta.hidden_size
183+
return _DEFAULT_HIDDEN
184+
185+
186+
def _largest_embedder(seen_models: dict[str, ModelMeta]) -> ModelMeta | None:
187+
"""Return the largest model in ``seen_models`` by parameter count, or None if empty."""
188+
if not seen_models:
189+
return None
190+
return max(seen_models.values(), key=lambda m: m.total_params)
191+
192+
193+
def _ram_for_module(meta: ModelMeta, stats: DatasetStats) -> float:
    """Return a loose upper bound on RAM in GB: model weights plus the tokenized dataset.

    The tokenized text is costed at ``n_samples * avg_tokens * 4`` bytes, i.e. one
    int32 per BPE/WordPiece token id. Preprocessing artefacts (attention masks,
    position ids, etc.) are intentionally left out — they are bounded by the same
    factor and the 4 bytes/token figure is tight enough for the report.
    """
    token_id_bytes = stats.n_samples * stats.avg_tokens * 4
    tokenized_gb = token_id_bytes / _BYTES_PER_GB
    return meta.weights_gb + tokenized_gb
203+
204+
205+
# Coefficients are dimensional (per-sample-per-feature-per-iteration seconds)
# rather than empirically tuned constants — they give relative-cost ordering
# across configurations and absolute ballpark wall-times.
# Multiplied into the product computed by ``_time_for_linear``.
_LINEAR_CPU_S_PER_SAMPLE_FEATURE_ITER = 1e-8
# CPU coefficient for ``_time_for_catboost``; divided by ``_CATBOOST_GPU_SPEEDUP`` when on GPU.
_CATBOOST_CPU_S_PER_SAMPLE_FEATURE_ITER = 1e-9
_CATBOOST_GPU_SPEEDUP = 10.0
# LogisticRegressionCV defaults: Cs=10, cv=3 -> 10x3 inner fits + 1 final refit = 31.
# NOTE(review): scikit-learn's own default is ``cv=None`` (5-fold), which would give 51;
# presumably "cv=3" is this project's wrapper default — confirm against the caller.
_LOGREG_CV_MULTIPLIER = 31
# Default value of `border_count` in CatBoost (number of histogram buckets per feature).
_CATBOOST_DEFAULT_BINS = 254
# Bytes per histogram bucket / tree node — order-of-magnitude constant.
# In ``_ram_for_catboost`` it multiplies the ``iterations * 2**depth`` tree-node count.
_CATBOOST_BYTES_PER_TREE_NODE = 32
217+
218+
219+
def _ram_for_linear(*, stats: DatasetStats, embedder_dim: int) -> float:
    """Return the RAM in GB for fitting a linear model on ``embedder_dim``-wide features.

    Sums three float64 (8-byte) terms:

    * design matrix: ``n_samples * embedder_dim`` values — the dominant term;
    * coefficients: ``max(1, n_classes) * embedder_dim`` values;
    * L-BFGS history: ``10 * embedder_dim`` values.
    """
    float64_bytes = 8.0
    design_matrix = float64_bytes * stats.n_samples * embedder_dim
    coefficients = float64_bytes * max(1, stats.n_classes) * embedder_dim
    lbfgs_history = 10.0 * float64_bytes * embedder_dim
    total_bytes = design_matrix + coefficients + lbfgs_history
    return total_bytes / _BYTES_PER_GB
225+
226+
227+
def _time_for_linear(
    *,
    n_trials: int,
    n_samples: int,
    embedder_dim: int,
    max_iter: int,
    cv_multiplier: int,
    class_multiplier: int,
) -> float:
    """Estimate LogisticRegression wall time, in hours.

    A single fit is costed as ``O(n_samples x n_features x max_iter x n_classes)``
    (sklearn's L-BFGS solver). That is scaled by ``cv_multiplier`` — the number of
    inner fits, 31 for the default LogisticRegressionCV — and by ``n_trials``, then
    converted to seconds with ``_LINEAR_CPU_S_PER_SAMPLE_FEATURE_ITER``.
    """
    seconds = n_trials * _LINEAR_CPU_S_PER_SAMPLE_FEATURE_ITER
    # Fold the remaining factors in strictly left to right so float rounding is stable.
    for factor in (n_samples, embedder_dim, max_iter, cv_multiplier, class_multiplier):
        seconds = seconds * factor
    return seconds / 3600.0
252+
253+
254+
def _ram_for_catboost(*, stats: DatasetStats, n_features: int, iterations: int, depth: int) -> float:
    """Return CatBoost RAM in GB: quantized data matrix + histograms + tree storage.

    * data matrix: 4 bytes per ``n_samples x n_features`` cell;
    * histograms: 4 bytes per ``n_features x _CATBOOST_DEFAULT_BINS`` bucket;
    * trees: ``iterations`` trees of ``2**depth`` nodes at
      ``_CATBOOST_BYTES_PER_TREE_NODE`` bytes each.
    """
    matrix_bytes = 4.0 * stats.n_samples * n_features
    histogram_bytes = 4.0 * n_features * _CATBOOST_DEFAULT_BINS
    nodes_per_tree = 2**depth
    tree_bytes = iterations * nodes_per_tree * _CATBOOST_BYTES_PER_TREE_NODE
    total_bytes = matrix_bytes + histogram_bytes + tree_bytes
    return float(total_bytes / _BYTES_PER_GB)
260+
261+
262+
def _time_for_catboost(
    *,
    n_trials: int,
    n_samples: int,
    n_features: int,
    iterations: int,
    depth: int,
    class_multiplier: int,
    on_gpu: bool,
) -> float:
    """Estimate CatBoost wall time, in hours.

    A single fit is costed as
    ``O(iterations x n_samples x n_features x depth x n_classes)``, scaled by
    ``n_trials``. On GPU the per-unit coefficient is divided by
    ``_CATBOOST_GPU_SPEEDUP`` (~10x faster than CPU for typical workloads per
    CatBoost's published benchmarks).
    https://catboost.ai/en/docs/concepts/speed-up-training
    """
    if on_gpu:
        unit_seconds = _CATBOOST_CPU_S_PER_SAMPLE_FEATURE_ITER / _CATBOOST_GPU_SPEEDUP
    else:
        unit_seconds = _CATBOOST_CPU_S_PER_SAMPLE_FEATURE_ITER

    seconds = n_trials * iterations * unit_seconds
    # Fold the remaining factors in strictly left to right so float rounding is stable.
    for factor in (n_samples, n_features, depth, class_multiplier):
        seconds = seconds * factor
    return seconds / 3600.0
284+
285+
286+
def _floor_to_power_of_two(n: int) -> int:
287+
"""Largest power of two <= ``n``; returns 0 when ``n < 1``."""
288+
if n < 1:
289+
return 0
290+
power = 1
291+
while power * 2 <= n:
292+
power *= 2
293+
return power

0 commit comments

Comments
 (0)