|
87 | 87 |
|
88 | 88 | evaluate_perplexity: true |
89 | 89 | evaluation_num_samples: 100 |
| 90 | + # Use the official NVIDIA Minitron few-shot settings for downstream tasks |
| 91 | + # (MMLU 5-shot, HellaSwag 10-shot, ARC 25-shot, WinoGrande 5-shot, etc.). |
| 92 | + use_nvidia_fewshot: true |
| 93 | + # Match the OATS Table 19 / common pruning-paper protocol for WikiText-2 perplexity: |
| 94 | + # concatenate the full test set and evaluate it in contiguous 2048-token blocks (no padding). |
| 95 | + perplexity_protocol: "oats" |
| 96 | + wikitext_subset: "wikitext-2-raw-v1" |
| 97 | + perplexity_seq_len: 2048 |
90 | 98 |
|
91 | 99 | evaluation_metrics: |
92 | 100 | # Language modeling |
|
99 | 107 | - "accuracy_hellaswag" |
100 | 108 | - "accuracy_arc_easy" |
101 | 109 | - "accuracy_arc_challenge" |
| 110 | + - "accuracy_openbookqa" |
102 | 111 |
|
103 | 112 | # Common Sense |
104 | 113 | - "accuracy_winogrande" |
@@ -174,6 +183,21 @@ supernode: |
174 | 183 | core_fraction: 0.01 |
175 | 184 | follower_fraction: 0.10 |
176 | 185 | halo_fraction: 0.10 |
| 186 | + # Connectivity definition (SCAR-Conn): fraction of a channel's down_proj write-mass |
| 187 | + # that lands on the top-K hidden dimensions most written-to by supernodes. |
| 188 | + # (Avoids the ~1/hidden_dim collapse of L1-normalized dot-product overlap for dense matrices.) |
| 189 | + connectivity_topk: 256 |
| 190 | + # Optional post-processing for Conn (defaults keep current behavior) |
| 191 | + connectivity_rank_normalize: false |
| 192 | + connectivity_power: 1.0 |
| 193 | + # Analysis-only: also estimate redundancy-to-core for a small random sample of non-halo channels |
| 194 | + # (used for paper mechanism plots; does NOT affect pruning decisions). |
| 195 | + non_halo_sample_size: 256 |
| 196 | + non_halo_sample_seed: 0 |
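| 197 | + # Protection mapping (rank-power): Protect = alpha + (1 - alpha) * (1 - rank^gamma); presumably alpha = protection_floor, gamma = protection_rank_power, and rank is normalized to [0, 1] (TODO confirm).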
| 198 | + protection_normalization: "rank_power" |
| 199 | + protection_rank_power: 8.0 |
| 200 | + protection_floor: 0.2 |
177 | 201 | protect_core: true |
178 | 202 | # Apply hard supernode protection only for the listed pruning metrics. |
179 | 203 | # If omitted, legacy behavior is to protect for *all* pruning metrics. |
@@ -286,7 +310,9 @@ pruning: |
286 | 310 | dependency_aware: true |
287 | 311 |
|
288 | 312 | sparsity_levels: [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7] |
289 | | - selection_modes: ["low", "high"] |
| 313 | + # We only report (and run) the standard pruning direction: prune *low*-scoring channels. |
| 314 | + # The "high" mode (prune highest scores) is a pathological control and is excluded from paper runs. |
| 315 | + selection_modes: ["low"] |
290 | 316 |
|
291 | 317 | # ALL algorithms including SOTA baselines |
292 | 318 | algorithms: |
|
0 commit comments