|
1 | 1 | # ============================================================================== |
2 | | -# Model: Mythos-26B-A4B-PRISM (Service: llama-gemma4) |
3 | | -# CCD Optimization: CCD1 (Frequency 8-15) for high-throughput multimodal |
4 | | -# Deep-Dive Analysis: 128 experts (16 on GPU). 3.5-bit weights. |
| 2 | +# Model: Mythos-26B-A4B-PRISM-ULTIMATE (Single-Model Maximum Performance) |
| 3 | +# CCD Optimization: ALL PHYSICAL CORES (0-15) for maximum KV/Prompt throughput |
| 4 | +# Deep-Dive Analysis: 128 experts (partially kept on CPU via N_CPU_MOE; remainder on GPU). 3.5-bit weights. |
5 | 5 | # ============================================================================== |
6 | 6 |
|
7 | 7 | SERVICE_NAME="llama-gemma4.service" |
8 | 8 | MODEL_PATH="/home/siva/models/gemma-4-26B-A4B-it/Ex0bit/mythos-26b-a4b-prism-pro-dq.gguf" |
9 | 9 | MMPRJ_PATH="/home/siva/models/gemma-4-26B-A4B-it/Ex0bit/mmproj-mythos-26b-a4b-prism-pro.gguf" |
10 | 10 | MODEL_ALIAS="gemma-4-26b-a4b-it" |
11 | 11 |
|
12 | | -# --- COMPUTE (CCD1 Focus) --- |
| 12 | +# --- COMPUTE (Saturation Mode) --- |
| 13 | +# Utilizing all physical cores across both CCDs for maximum single-model burst. NOTE(review): CPU_AFFINITY below does not include CPUs 0-7; confirm the mask matches this intent. |
13 | 14 | CPU_AFFINITY="16-23,8-15" |
14 | 15 | THREADS=16 |
15 | 16 | THREADS_BATCH=16 |
16 | 17 | N_GPU_LAYERS=999 |
17 | | -# Tier: 80 experts on CPU (48 on GPU) to accommodate 128K q4_0 cache in 16GB. |
18 | | -N_CPU_MOE=64 |
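| 18 | +# Balanced offloading: keep the MoE expert weights of the first N_CPU_MOE layers on CPU (the value counts layers, not experts); the rest stay on GPU to maximize throughput. |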
| 19 | +N_CPU_MOE=34 |
| 20 | +UBATCH_SIZE=1024 |
| 21 | +MLOCK=true |
| 22 | +MMPRJ_OFFLOAD=false |
| 23 | + |
19 | 24 |
|
20 | 25 | # --- MEMORY --- |
21 | | -# Use q4_0 cache for VRAM efficiency |
22 | | -CACHE_TYPE_K="q4_0" |
23 | | -CACHE_TYPE_V="q4_0" |
24 | | -# Optimized 32K context for high-context Qwen coexistence |
25 | | -CTX_SIZE=32768 |
| 26 | +# Use q8_0 cache for higher precision (Mandatory for PRISM-DQ fidelity) |
| 27 | +CACHE_TYPE_K="q8_0" |
| 28 | +CACHE_TYPE_V="q8_0" |
| 29 | +# High-context mode: 256K context with 2 parallel slots (VRAM Safety) |
| 30 | +CTX_SIZE=262144 |
26 | 31 | PARALLEL=2 |
27 | 32 | PORT=8081 |
28 | | -# Increased to 1024 for maximum prompt evaluation throughput on CCD1 |
29 | | -UBATCH_SIZE=1024 |
30 | | -BATCH_SIZE=2048 |
31 | 33 |
|
32 | 34 | # --- INFRASTRUCTURE --- |
33 | 35 | JINJA=true |
34 | 36 |
|
35 | | -# --- SPEC-SPECIFIC PARAMS (General & Creative Tasks) --- |
36 | | -# --prio is already set via PRIORITY var, no need to duplicate here |
37 | | -EXTRA_ARGS="" |
38 | | - |
39 | | -# Sampling (Official Gemma-4 Specs + PRISM-PRO-DQ Tuning) |
40 | | -TEMP=1.0 |
41 | | -MIN_P=0.02 |
42 | | -TOP_P=0.95 |
43 | | -TOP_K=64 |
44 | | -REPEAT_PENALTY=1.1 |
45 | | -REPEAT_LAST_N=128 |
46 | | -DRY_MULTIPLIER=0.8 |
47 | | -DRY_BASE=1.75 |
48 | | -DRY_ALLOWED_LENGTH=2 |
49 | | -DRY_PENALTY_LAST_N=4096 |
50 | | -# Optimized chain for PRISM creative performance |
51 | | -SAMPLERS="top_p;temperature" |
| 37 | +# --- SPEC-SPECIFIC PARAMS --- |
| 38 | +# High priority and distributed NUMA for cross-CCD memory access |
| 39 | +# EXTRA_ARGS="--prio 3 --flash-attn on --numa distribute" |
| 40 | +PRIORITY=3 |
| 41 | +FLASH_ATTN=on |
| 42 | +NUMA=distribute |
| 43 | + |
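| 44 | +# Sampling preset optimized for Abliterated PRISM-PRO-DQ. The values below are commented out, so this file does not set them; uncomment to apply. |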
| 45 | +# Reduced temperature slightly from 1.5 to 1.15 to preserve logic without safety anchors. |
| 46 | +#TEMP=1.15 |
| 47 | +#MIN_P=0.06 |
| 48 | +#TOP_P=0.95 |
| 49 | +#TOP_K=40 |
| 50 | +#REPEAT_PENALTY=1.0 |
| 51 | +#REPEAT_LAST_N=256 |
| 52 | +#DRY_MULTIPLIER=0.8 |
| 53 | +#DRY_BASE=1.75 |
| 54 | +#DRY_ALLOWED_LENGTH=2 |
| 55 | +#DRY_PENALTY_LAST_N=4096 |
| 56 | +#SAMPLERS="dry;top_k;top_p;min_p;temperature" |
52 | 57 |
|
53 | 58 | # Reasoning Budget (Native Gemma-4 Mode) |
54 | 59 | REASONING="auto" |
55 | 60 | REASONING_FORMAT="auto" |
56 | 61 | REASONING_BUDGET=-1 |
57 | | -# REASONING_BUDGET_MESSAGE removed — never fires when REASONING_BUDGET=-1 |
58 | 62 | REASONING_BUDGET_MESSAGE=" [Logic Finalized] " |
0 commit comments