Skip to content

Commit 6d1509f

Browse files
committed
chore: update config files and cleanup ultimate profile
1 parent 901bd96 commit 6d1509f

5 files changed

Lines changed: 129 additions & 420 deletions

File tree

scripts-local/gemma-4-26b-a4b-ultimate.conf

Lines changed: 0 additions & 58 deletions
This file was deleted.

scripts-local/gemma-4-26b-a4b.conf

Lines changed: 36 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -1,58 +1,62 @@
11
# ==============================================================================
2-
# Model: Mythos-26B-A4B-PRISM (Service: llama-gemma4)
3-
# CCD Optimization: CCD1 (Frequency 8-15) for high-throughput multimodal
4-
# Deep-Dive Analysis: 128 experts (16 on GPU). 3.5-bit weights.
2+
# Model: Mythos-26B-A4B-PRISM-ULTIMATE (Single-Model Maximum Performance)
3+
# CCD Optimization: ALL PHYSICAL CORES across both CCDs (CPU_AFFINITY="16-23,8-15") for maximum KV/Prompt throughput
4+
# Deep-Dive Analysis: 128 experts (mostly on GPU; N_CPU_MOE=34 keeps part of the MoE weights on CPU). 3.5-bit weights.
55
# ==============================================================================
66

77
SERVICE_NAME="llama-gemma4.service"
88
MODEL_PATH="/home/siva/models/gemma-4-26B-A4B-it/Ex0bit/mythos-26b-a4b-prism-pro-dq.gguf"
99
MMPRJ_PATH="/home/siva/models/gemma-4-26B-A4B-it/Ex0bit/mmproj-mythos-26b-a4b-prism-pro.gguf"
1010
MODEL_ALIAS="gemma-4-26b-a4b-it"
1111

12-
# --- COMPUTE (CCD1 Focus) ---
12+
# --- COMPUTE (Saturation Mode) ---
13+
# Utilizing all physical cores across both CCDs for maximum single-model burst.
1314
CPU_AFFINITY="16-23,8-15"
1415
THREADS=16
1516
THREADS_BATCH=16
1617
N_GPU_LAYERS=999
17-
# Tier: 80 experts on CPU (48 on GPU) to accommodate 128K q4_0 cache in 16GB.
18-
N_CPU_MOE=64
18+
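# Balanced offloading: N_CPU_MOE=34 keeps part of the MoE expert weights on the CPU so the rest can stay on the GPU for maximum throughput.
# NOTE(review): if this maps to llama.cpp's --n-cpu-moe, the value counts layers, not experts - confirm.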
19+
N_CPU_MOE=34
20+
UBATCH_SIZE=1024
21+
MLOCK=true
22+
MMPRJ_OFFLOAD=false
23+
1924

2025
# --- MEMORY ---
21-
# Use q4_0 cache for VRAM efficiency
22-
CACHE_TYPE_K="q4_0"
23-
CACHE_TYPE_V="q4_0"
24-
# Optimized 32K context for high-context Qwen coexistence
25-
CTX_SIZE=32768
26+
# Use q8_0 cache for higher precision (Mandatory for PRISM-DQ fidelity)
27+
CACHE_TYPE_K="q8_0"
28+
CACHE_TYPE_V="q8_0"
29+
# High-context mode: 256K context with 2 parallel slots (VRAM Safety)
30+
CTX_SIZE=262144
2631
PARALLEL=2
2732
PORT=8081
28-
# Increased to 1024 for maximum prompt evaluation throughput on CCD1
29-
UBATCH_SIZE=1024
30-
BATCH_SIZE=2048
3133

3234
# --- INFRASTRUCTURE ---
3335
JINJA=true
3436

35-
# --- SPEC-SPECIFIC PARAMS (General & Creative Tasks) ---
36-
# --prio is already set via PRIORITY var, no need to duplicate here
37-
EXTRA_ARGS=""
38-
39-
# Sampling (Official Gemma-4 Specs + PRISM-PRO-DQ Tuning)
40-
TEMP=1.0
41-
MIN_P=0.02
42-
TOP_P=0.95
43-
TOP_K=64
44-
REPEAT_PENALTY=1.1
45-
REPEAT_LAST_N=128
46-
DRY_MULTIPLIER=0.8
47-
DRY_BASE=1.75
48-
DRY_ALLOWED_LENGTH=2
49-
DRY_PENALTY_LAST_N=4096
50-
# Optimized chain for PRISM creative performance
51-
SAMPLERS="top_p;temperature"
37+
# --- SPEC-SPECIFIC PARAMS ---
38+
# High priority and distributed NUMA for cross-CCD memory access
39+
# EXTRA_ARGS="--prio 3 --flash-attn on --numa distribute"
40+
PRIORITY=3
41+
FLASH_ATTN=on
42+
NUMA=distribute
43+
44+
# Optimized for Abliterated PRISM-PRO-DQ
45+
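# Temperature reduced from 1.5 to 1.15 to preserve logic without safety anchors.
# NOTE: every sampler setting below is commented out, so none of these values are set by this file; uncomment them to apply.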
46+
#TEMP=1.15
47+
#MIN_P=0.06
48+
#TOP_P=0.95
49+
#TOP_K=40
50+
#REPEAT_PENALTY=1.0
51+
#REPEAT_LAST_N=256
52+
#DRY_MULTIPLIER=0.8
53+
#DRY_BASE=1.75
54+
#DRY_ALLOWED_LENGTH=2
55+
#DRY_PENALTY_LAST_N=4096
56+
#SAMPLERS="dry;top_k;top_p;min_p;temperature"
5257

5358
# Reasoning Budget (Native Gemma-4 Mode)
5459
REASONING="auto"
5560
REASONING_FORMAT="auto"
5661
REASONING_BUDGET=-1
57-
# REASONING_BUDGET_MESSAGE removed — never fires when REASONING_BUDGET=-1
5862
REASONING_BUDGET_MESSAGE=" [Logic Finalized] "

0 commit comments

Comments
 (0)