saas-home
diff --git a/‎scripts-local/gemma-4-26b-a4b-ultimate.conf‎
Lines changed: 0 additions & 58 deletions b/‎scripts-local/gemma-4-26b-a4b-ultimate.conf‎
Lines changed: 0 additions & 58 deletions
diff --git a/‎scripts-local/gemma-4-26b-a4b.conf‎
Lines changed: 36 additions & 32 deletions b/‎scripts-local/gemma-4-26b-a4b.conf‎
Lines changed: 36 additions & 32 deletions
@@ -1,58 +1,62 @@
 # ==============================================================================
-# Model: Mythos-26B-A4B-PRISM (Service: llama-gemma4)
-# CCD Optimization: CCD1 (Frequency 8-15) for high-throughput multimodal
-# Deep-Dive Analysis: 128 experts (16 on GPU). 3.5-bit weights.
+# Model: Mythos-26B-A4B-PRISM-ULTIMATE (Single-Model Maximum Performance)
+# CCD Optimization: 16 threads pinned to CPUs 16-23,8-15 (see CPU_AFFINITY) for maximum KV/Prompt throughput
+# Deep-Dive Analysis: 128 experts (partial CPU offload, see N_CPU_MOE). 3.5-bit weights.
 # ==============================================================================
 
 SERVICE_NAME="llama-gemma4.service"
 MODEL_PATH="/home/siva/models/gemma-4-26B-A4B-it/Ex0bit/mythos-26b-a4b-prism-pro-dq.gguf"
 MMPRJ_PATH="/home/siva/models/gemma-4-26B-A4B-it/Ex0bit/mmproj-mythos-26b-a4b-prism-pro.gguf"
 MODEL_ALIAS="gemma-4-26b-a4b-it"
 
-# --- COMPUTE (CCD1 Focus) ---
+# --- COMPUTE (Saturation Mode) ---
+# Utilizing all physical cores across both CCDs for maximum single-model burst.
 CPU_AFFINITY="16-23,8-15"
 THREADS=16
 THREADS_BATCH=16
 N_GPU_LAYERS=999
-# Tier: 80 experts on CPU (48 on GPU) to accommodate 128K q4_0 cache in 16GB.
-N_CPU_MOE=64
+# Balanced offloading: N_CPU_MOE=34 keeps only part of the MoE expert weights on CPU to maximize GPU throughput.
+N_CPU_MOE=34
+UBATCH_SIZE=1024
+MLOCK=true
+MMPRJ_OFFLOAD=false
+
 
 # --- MEMORY ---
-# Use q4_0 cache for VRAM efficiency
-CACHE_TYPE_K="q4_0"
-CACHE_TYPE_V="q4_0"
-# Optimized 32K context for high-context Qwen coexistence
-CTX_SIZE=32768
+# Use q8_0 cache for higher precision (Mandatory for PRISM-DQ fidelity)
+CACHE_TYPE_K="q8_0"
+CACHE_TYPE_V="q8_0"
+# High-context mode: 256K context with 2 parallel slots (VRAM Safety)
+CTX_SIZE=262144
 PARALLEL=2
 PORT=8081
-# Increased to 1024 for maximum prompt evaluation throughput on CCD1
-UBATCH_SIZE=1024
-BATCH_SIZE=2048
 
 # --- INFRASTRUCTURE ---
 JINJA=true
 
-# --- SPEC-SPECIFIC PARAMS (General & Creative Tasks) ---
-# --prio is already set via PRIORITY var, no need to duplicate here
-EXTRA_ARGS=""
-
-# Sampling (Official Gemma-4 Specs + PRISM-PRO-DQ Tuning)
-TEMP=1.0
-MIN_P=0.02
-TOP_P=0.95
-TOP_K=64
-REPEAT_PENALTY=1.1
-REPEAT_LAST_N=128
-DRY_MULTIPLIER=0.8
-DRY_BASE=1.75
-DRY_ALLOWED_LENGTH=2
-DRY_PENALTY_LAST_N=4096
-# Optimized chain for PRISM creative performance
-SAMPLERS="top_p;temperature"
+# --- SPEC-SPECIFIC PARAMS ---
+# High priority and distributed NUMA for cross-CCD memory access
+# EXTRA_ARGS="--prio 3 --flash-attn on --numa distribute"
+PRIORITY=3
+FLASH_ATTN=on
+NUMA=distribute
+
+# Optimized for Abliterated PRISM-PRO-DQ
+# Currently disabled (all overrides below are commented out). Temperature was reduced slightly from 1.5 to 1.15 to preserve logic without safety anchors.
+#TEMP=1.15
+#MIN_P=0.06
+#TOP_P=0.95
+#TOP_K=40
+#REPEAT_PENALTY=1.0
+#REPEAT_LAST_N=256
+#DRY_MULTIPLIER=0.8
+#DRY_BASE=1.75
+#DRY_ALLOWED_LENGTH=2
+#DRY_PENALTY_LAST_N=4096
+#SAMPLERS="dry;top_k;top_p;min_p;temperature"
 
 # Reasoning Budget (Native Gemma-4 Mode)
 REASONING="auto"
 REASONING_FORMAT="auto"
 REASONING_BUDGET=-1
-# REASONING_BUDGET_MESSAGE removed — never fires when REASONING_BUDGET=-1
 REASONING_BUDGET_MESSAGE=" [Logic Finalized] "