@@ -667,158 +163,157 @@

- +
- 🔹 - Standard Transformer (Llama, Mistral, GLM, MiniMax, etc.) + 🔸 + MLA (Multi-head Latent Attention)
- Single-GPU KV Cache = 2 × layers × tokens × batch × hidden_size × (kv_heads / attn_heads) / tp × dtype_size + Single-GPU KV Cache = layers × tokens × batch × (kv_lora_rank + qk_rope_head_dim) / tp × dtype_size
- 2: - Key and Value matrices -
-
- layers: - Number of hidden layers + Note: + No factor of 2 because K and V are compressed together into latent space
- tokens: - Sequence length + kv_lora_rank: + Compressed KV dimension
- batch: - Batch size + qk_rope_head_dim: + RoPE positional encoding dimension
- hidden_size: - Hidden dimension + Models: + DeepSeek V3/R1/V3.2, Kimi K2/K2.5, GLM-5
+
+
+
+ + +
+
+ 🔹 + GQA (Grouped-Query Attention) +
+
+
+ Single-GPU KV Cache = 2 × layers × tokens × batch × kv_heads × head_dim / tp × dtype_size +
+
- (kv_heads / attn_heads): - GQA compression ratio + 2: + Key and Value matrices
- tp: - Tensor parallelism (splits KV cache across GPUs) + kv_heads: + Number of KV heads (fewer than the number of attention heads)
- dp: - Data parallelism (does NOT affect single-GPU KV cache) + head_dim: + Dimension per attention head
- dtype_size: - Data type size (float16=2, float32=4, etc.) + Models: + Qwen3, GLM-4.7, GLM-4.5, MiniMax-M2 series
- -
+ +
- 🔸 - MLA (DeepSeek V3/R1, Kimi K2) + ⚠️ + Hybrid (Mixed Architecture)
- Single-GPU KV Cache = layers × tokens × batch × (kv_lora_rank + qk_rope_head_dim) / tp × dtype_size + Single-GPU KV Cache = 2 × attn_layers × tokens × batch × kv_heads × head_dim / tp × dtype_size
- Note: - No factor of 2 because K and V are compressed together -
-
- kv_lora_rank: - Compressed KV dimension -
-
- qk_rope_head_dim: - RoPE positional encoding dimension + Warning: + Calculation may not be accurate for hybrid architectures. Further adaptation needed.
- tp: - Tensor parallelism (splits KV cache across GPUs) + Linear + Full Attention: + layer_types contains "linear_attention" and "full_attention" (e.g., Qwen3.5 series)
- dp: - Data parallelism (does NOT affect single-GPU KV cache) + Sliding + Full Attention: + layer_types contains "sliding_attention" and "full_attention" + sliding_window (e.g., Gemma4, GPT-OSS)
- Advantage: - Significantly reduces KV cache memory usage + Attention + SSM: + hybrid_layer_pattern with 0=SSM, 1=Attention (e.g., MiMo-V2-Flash)
Models: - DeepSeek V3/R1/V3.1/V3.2, Kimi K2 + Qwen3.5 series
- +
🔹 - Qwen3 (GQA with explicit head_dim) + Standard Transformer (MHA/MQA)
- Single-GPU KV Cache = 2 × layers × tokens × batch × kv_heads × head_dim / tp × dtype_size + Single-GPU KV Cache = 2 × layers × tokens × batch × hidden_size × (kv_heads / attn_heads) / tp × dtype_size
- head_dim: - Dimension per attention head (hidden_size / attn_heads) -
-
- kv_heads: - Number of KV heads (usually less than attention heads) + MHA: + kv_heads = attn_heads (every head has its own KV)
- tp: - Tensor parallelism (splits KV cache across GPUs) -
-
- dp: - Data parallelism (does NOT affect single-GPU KV cache) -
-
- Difference: - Uses explicit head_dim instead of hidden_size + MQA: + kv_heads = 1 (all heads share single KV)
Models: - Qwen3-32B/235B-A22B/480B-A35B, etc. + GPT-2 (MHA), PaLM/Falcon (MQA)
-
+
🔍 - How to Identify Model Type + How to Identify Model Type from Config
- MLA: - Check for kv_lora_rank and qk_rope_head_dim in config + 1. MLA: + Check for kv_lora_rank + qk_rope_head_dim
- Qwen3: - Check for head_dim in config (but not MLA) + 2. Hybrid: + layer_types with multiple types, or hybrid_layer_pattern, or sliding_window with mixed layers
- Standard: - Only has hidden_size, num_attention_heads, num_key_value_heads + 3. GQA: + Has head_dim and kv_heads < attn_heads +
+
+ 4. Standard: + Falls through to MHA/MQA/GQA based on kv_heads ratio +
+
+ Multimodal: + Config nested in text_config
@@ -830,1614 +325,8 @@

- - \ No newline at end of file + + + + + diff --git a/docs/source/_static/model-configs.js b/docs/source/_static/model-configs.js new file mode 100644 index 000000000..2aba433a4 --- /dev/null +++ b/docs/source/_static/model-configs.js @@ -0,0 +1,263 @@ +/** + * KV Cache Calculator - Model Configurations & Translations + * + * This file contains: + * 1. Translation strings (English only) + * 2. Embedded preset model configurations + */ + +// English-only translations +const translations = { + en: { + 'title': 'KV Cache Size Calculator', + 'subtitle': 'Calculate KV cache size for large language models', + 'input-panel': 'Configuration', + 'model-source': 'Model Source', + 'preset-models': 'Preset Models', + 'custom-model': 'Custom Model', + 'select-model': 'Select Model', + 'loading': 'Loading models...', + 'model-url': 'Model URL', + 'data-type': 'Data Type', + 'token-count': 'Number of Tokens', + 'batch-size': 'Batch Size', + 'tp': 'Tensor Parallelism (TP)', + 'dp': 'Data Parallelism (DP)', + 'gpu-memory': 'Single-GPU Memory for KV Cache (GB)', + 'gpu-memory-hint': 'Memory available for KV cache (excluding model weights)', + 'calculate': 'Calculate KV Cache', + 'max-tokens-calculator': 'Maximum Tokens Calculator', + 'calculate-max-tokens': 'Calculate Max Tokens', + 'results': 'Results', + 'no-results': 'Configure your model and click calculate to see results.', + 'calculation-details': 'Calculation Details', + 'footer': 'KV Cache Calculator', + 'close': 'Close', + 'error': 'Error', + 'success': 'Success', + 'warning': 'Warning', + 'invalid-tokens': 'Please enter a valid number of tokens.', + 'model-not-found': 'Model configuration not found.', + 'calculation-success': 'KV cache size calculated successfully!', + 'model-url-invalid': 'Please enter a valid model URL.', + 'fetch-error': 'Failed to fetch model configuration. Please check the URL and try again.', + 'calculating': 'Calculating...' 
+ } +}; + +/** + * Get embedded model configurations + * Updated with 2025 mainstream models + */ +function getEmbeddedModelConfigs() { + return { + // DeepSeek Models + "deepseek-ai/DeepSeek-V3": { + "hidden_size": 7168, + "num_attention_heads": 128, + "num_hidden_layers": 61, + "num_key_value_heads": 128, + "kv_lora_rank": 512, + "qk_rope_head_dim": 64 + }, + "deepseek-ai/DeepSeek-R1": { + "hidden_size": 7168, + "num_attention_heads": 128, + "num_hidden_layers": 61, + "num_key_value_heads": 128, + "kv_lora_rank": 512, + "qk_rope_head_dim": 64 + }, + "deepseek-ai/DeepSeek-V3.1-Terminus": { + "hidden_size": 7168, + "num_attention_heads": 128, + "num_hidden_layers": 61, + "num_key_value_heads": 128, + "kv_lora_rank": 512, + "qk_rope_head_dim": 64 + }, + "deepseek-ai/DeepSeek-V3.2": { + "hidden_size": 7168, + "num_attention_heads": 128, + "num_hidden_layers": 61, + "num_key_value_heads": 128, + "kv_lora_rank": 512, + "qk_rope_head_dim": 64 + }, + // Qwen3 Models + "Qwen/Qwen3-32B": { + "hidden_size": 5120, + "num_attention_heads": 64, + "num_hidden_layers": 64, + "num_key_value_heads": 8, + "head_dim": 128 + }, + "Qwen/Qwen3-235B-A22B": { + "hidden_size": 4096, + "num_attention_heads": 64, + "num_hidden_layers": 94, + "num_key_value_heads": 4, + "head_dim": 128 + }, + "Qwen/Qwen3-Coder-480B-A35B-Instruct": { + "hidden_size": 6144, + "num_attention_heads": 96, + "num_hidden_layers": 62, + "num_key_value_heads": 8, + "head_dim": 128 + }, + "Qwen/Qwen3-14B": { + "hidden_size": 5120, + "num_attention_heads": 40, + "num_hidden_layers": 40, + "num_key_value_heads": 8, + "head_dim": 128 + }, + "Qwen/Qwen2.5-7B-Instruct": { + "hidden_size": 3584, + "num_attention_heads": 28, + "num_hidden_layers": 28, + "num_key_value_heads": 4 + }, + "Qwen/Qwen-7B": { + "hidden_size": 4096, + "num_attention_heads": 32, + "num_hidden_layers": 32, + "num_key_value_heads": 32 + }, + // Qwen3.5 Series (GQA with explicit head_dim, some Hybrid) + "Qwen/Qwen3.5-397B-A17B": { + "hidden_size": 4096, 
+ "num_attention_heads": 32, + "num_hidden_layers": 60, + "num_key_value_heads": 2, + "head_dim": 256, + "is_hybrid": true + }, + "Qwen/Qwen3.5-122B-A10B": { + "hidden_size": 3072, + "num_attention_heads": 32, + "num_hidden_layers": 48, + "num_key_value_heads": 2, + "head_dim": 256, + "is_hybrid": true + }, + "Qwen/Qwen3.5-35B-A3B": { + "hidden_size": 2048, + "num_attention_heads": 16, + "num_hidden_layers": 40, + "num_key_value_heads": 2, + "head_dim": 256, + "is_hybrid": true + }, + "Qwen/Qwen3.5-27B": { + "hidden_size": 5120, + "num_attention_heads": 40, + "num_hidden_layers": 64, + "num_key_value_heads": 4, + "head_dim": 256, + "is_hybrid": true + }, + // Llama Models + "meta-llama/Llama-3.1-70B-Instruct": { + "hidden_size": 8192, + "num_attention_heads": 64, + "num_hidden_layers": 80, + "num_key_value_heads": 8 + }, + "meta-llama/Llama-3.1-405B": { + "hidden_size": 16384, + "num_attention_heads": 128, + "num_hidden_layers": 126, + "num_key_value_heads": 8 + }, + // GLM Series + // GQA + "zai-org/GLM-4.5":{ + "hidden_size": 5120, + "num_attention_heads": 96, + "num_hidden_layers": 92, + "num_key_value_heads": 8, + "head_dim": 128 + }, + "zai-org/GLM-4.5-Air": { + "hidden_size": 4096, + "num_attention_heads": 96, + "num_hidden_layers": 46, + "num_key_value_heads": 8, + "head_dim": 128 + }, + "zai-org/GLM-4.7": { + "hidden_size": 5120, + "num_attention_heads": 96, + "num_hidden_layers": 92, + "num_key_value_heads": 8, + "head_dim": 128 + }, + // MLA + "zai-org/GLM-4.7-Flash": { + "hidden_size": 2048, + "num_attention_heads": 20, + "num_hidden_layers": 47, + "num_key_value_heads": 20, + "kv_lora_rank": 512, + "qk_rope_head_dim": 64 + }, + "zai-org/GLM-5": { + "hidden_size": 6144, + "num_attention_heads": 64, + "num_hidden_layers": 78, + "num_key_value_heads": 64, + "kv_lora_rank": 512, + "qk_rope_head_dim": 64 + }, + "zai-org/GLM-5.1": { + "hidden_size": 6144, + "num_attention_heads": 64, + "num_hidden_layers": 78, + "num_key_value_heads": 64, + "kv_lora_rank": 
512, + "qk_rope_head_dim": 64 + }, + + // MiniMax Series (GQA + MoE) + "minimax/MiniMax-M2.5": { + "hidden_size": 3072, + "num_attention_heads": 48, + "num_hidden_layers": 62, + "num_key_value_heads": 8, + "head_dim": 128 + }, + "minimax/MiniMax-M2.1": { + "hidden_size": 3072, + "num_attention_heads": 48, + "num_hidden_layers": 62, + "num_key_value_heads": 8, + "head_dim": 128 + }, + "minimax/MiniMax-M2": { + "hidden_size": 3072, + "num_attention_heads": 48, + "num_hidden_layers": 62, + "num_key_value_heads": 8, + "head_dim": 128 + }, + // Kimi Series (MLA + Multimodal) + "moonshot/Kimi-K2.5": { + "hidden_size": 7168, + "num_attention_heads": 64, + "num_hidden_layers": 61, + "num_key_value_heads": 64, + "kv_lora_rank": 512, + "qk_rope_head_dim": 64 + }, + "moonshot/Kimi-K2": { + "hidden_size": 7168, + "num_attention_heads": 64, + "num_hidden_layers": 61, + "num_key_value_heads": 64, + "kv_lora_rank": 512, + "qk_rope_head_dim": 64 + } + }; +} diff --git a/docs/source/_static/styles.css b/docs/source/_static/styles.css new file mode 100644 index 000000000..557bb30df --- /dev/null +++ b/docs/source/_static/styles.css @@ -0,0 +1,510 @@ +/* CSS Variables for Theme Support - fully compatible with Sphinx themes */ +:root { + /* Use system colors and transparent backgrounds to match Sphinx */ + --bg-primary: transparent; + --bg-secondary: rgba(248, 250, 252, 0.5); + --bg-card: rgba(255, 255, 255, 0.8); + --text-primary: currentColor; + --text-secondary: currentColor; + --border-color: rgba(128, 128, 128, 0.2); + --border-hover: rgba(128, 128, 128, 0.4); + --accent-primary: #5191ee; /* Updated to match requirement */ + --accent-primary-hover: #4179d8; /* Darker shade for hover */ + --accent-secondary: currentColor; + --accent-success: #10b981; + --accent-warning: #f59e0b; + --accent-error: #ef4444; + --shadow-sm: 0 1px 3px rgba(0, 0, 0, 0.1); + --shadow-md: 0 4px 6px rgba(0, 0, 0, 0.1); + --shadow-lg: 0 10px 15px rgba(0, 0, 0, 0.1); +} + +/* Dark theme adjustments for 
Sphinx dark theme */ +[data-theme="dark"], html[data-theme="dark"] { + --bg-secondary: rgba(30, 41, 59, 0.6); + --bg-card: rgba(30, 41, 59, 0.8); + --border-color: rgba(255, 255, 255, 0.1); + --border-hover: rgba(255, 255, 255, 0.2); + --shadow-sm: 0 1px 3px rgba(0, 0, 0, 0.3); + --shadow-md: 0 4px 6px rgba(0, 0, 0, 0.3); + --shadow-lg: 0 10px 15px rgba(0, 0, 0, 0.3); +} + +/* Reset and base styles */ +* { + margin: 0; + padding: 0; + box-sizing: border-box; +} + +body { + font-family: '微软雅黑', 'Microsoft YaHei', 'Times New Roman', sans-serif; + background-color: var(--bg-primary); + color: var(--text-primary); + line-height: 1.6; +} + +/* Layout components */ +.container { + max-width: 800px; + margin: 0 auto; + padding: 0 0.5rem; +} + +.main-content { + display: grid; + grid-template-columns: 1fr 1fr; + gap: 1rem; + margin-bottom: 1rem; +} + +@media (max-width: 768px) { + .main-content { + grid-template-columns: 1fr; + } +} + +.card { + background-color: var(--bg-card); + border: 1px solid var(--border-color); + border-radius: 12px; + padding: 1.5rem; + box-shadow: var(--shadow-md); +} + + +/* Typography */ +.title { + font-size: 1.5rem; + font-weight: 700; + color: var(--text-primary); + margin-bottom: 0.3rem; +} + +.subtitle { + font-size: 0.9rem; + color: var(--text-secondary); + margin-bottom: 1rem; +} + +.section-title { + font-size: 1.1rem; + font-weight: 600; + color: var(--text-primary); + margin-bottom: 1rem; + display: flex; + align-items: center; + gap: 0.5rem; +} + +/* Form elements */ +.form-group { + margin-bottom: 1rem; +} + +.form-label { + display: block; + font-size: 0.8rem; + font-weight: 500; + color: var(--text-primary); + margin-bottom: 0.3rem; +} + +.form-select { + width: 100%; + padding: 0.75rem 1rem; + border: 1px solid var(--border-color); + border-radius: 8px; + background-color: var(--bg-primary); + color: var(--text-primary); + font-size: 0.875rem; +} + +.form-select:focus { + outline: none; + border-color: var(--accent-primary); + 
box-shadow: 0 0 0 3px rgba(81, 145, 238, 0.1); /* Updated to use new blue color */ +} + +.form-input { + width: 100%; + padding: 0.75rem 1rem; + border: 1px solid var(--border-color); + border-radius: 8px; + background-color: var(--bg-primary); + color: var(--text-primary); + font-size: 0.875rem; +} + +.form-input:focus { + outline: none; + border-color: var(--accent-primary); + box-shadow: 0 0 0 3px rgba(81, 145, 238, 0.1); /* Updated to use new blue color */ +} + +.form-input::placeholder { + color: var(--text-secondary); + opacity: 0.5; +} + +/* Buttons */ +.btn { + padding: 0.75rem 1.5rem; + border: none; + border-radius: 8px; + font-size: 0.875rem; + font-weight: 500; + cursor: pointer; + display: inline-flex; + align-items: center; + justify-content: center; + gap: 0.5rem; + text-decoration: none; + min-height: 44px; +} + +.btn-primary { + background: #5191ee; /* Unified blue color - removed gradient */ + color: white; +} + + +.btn-secondary { + background: #5191ee; /* Unified blue color - removed gradient */ + color: white; +} + + +.btn-group { + display: flex; + gap: 0.75rem; + flex-wrap: wrap; +} + +.btn-group .btn { + flex: 1; + min-width: 0; +} + +/* Model source selector */ +.model-source-selector { + display: flex; + gap: 0.25rem; + background: var(--bg-secondary); + padding: 0.25rem; + border-radius: 8px; + border: 1px solid var(--border-color); +} + +.model-source-option { + flex: 1; + padding: 0.5rem 1rem; + text-align: center; + border-radius: 6px; + font-size: 0.875rem; + font-weight: 500; + cursor: pointer; + background: transparent; + color: var(--text-secondary); + border: 1px solid transparent; +} + +.model-source-option:hover { + color: var(--text-primary); + background: var(--bg-card); +} + +.model-source-option.active { + background: #5191ee; /* Unified blue color - removed gradient */ + color: white; + border-color: #4179d8; +} + + +/* Results display */ +.result-display { + text-align: center; + padding: 1.5rem; + background: 
var(--bg-secondary); + border-radius: 8px; + margin-bottom: 1rem; +} + +.result-value { + font-size: 2rem; + font-weight: 700; + color: var(--accent-primary); + margin-bottom: 0.5rem; +} + +.result-label { + font-size: 0.9rem; + color: var(--text-secondary); +} + +.metrics-row { + display: flex; + flex-wrap: wrap; + gap: 0.75rem; + margin-bottom: 1rem; +} + +.metric-item { + background: var(--bg-secondary); + padding: 0.5rem 0.75rem; + border-radius: 6px; + font-size: 0.8rem; + display: flex; + align-items: center; + gap: 0.25rem; +} + +/* Calculation steps - Ultra compact design */ +.calculation-steps { + background: linear-gradient(135deg, var(--bg-secondary) 0%, rgba(81, 145, 238, 0.05) 100%); + border-radius: 12px; + padding: 0.75rem; + margin-top: 0.75rem; + border: 1px solid var(--border-color); +} + +.formula-card { + background: var(--bg-card); + border-radius: 8px; + padding: 0.625rem 0.875rem; + margin-bottom: 0.625rem; + border-left: 3px solid var(--accent-primary); +} + +.formula-header { + display: flex; + align-items: center; + gap: 0.5rem; + margin-bottom: 0.375rem; + font-weight: 600; + color: var(--text-primary); + font-size: 0.8rem; +} + +.formula-content { + background: var(--bg-secondary); + border-radius: 6px; + padding: 0.5rem 0.625rem; + font-family: 'Consolas', 'Monaco', 'Courier New', monospace; + font-size: 0.75rem; + line-height: 1.4; + color: var(--text-primary); +} + +.formula-main { + color: var(--accent-primary); + font-weight: 600; + margin-bottom: 0.25rem; + font-size: 0.8rem; +} + +.formula-breakdown { + margin-top: 0.25rem; + padding-top: 0.25rem; + border-top: 1px dashed var(--border-color); +} + +.formula-step { + display: flex; + align-items: baseline; + margin-bottom: 0.125rem; +} + +.formula-step-label { + color: var(--text-secondary); + margin-right: 0.5rem; + flex-shrink: 0; + font-size: 0.7rem; +} + +.formula-step-value { + color: var(--text-primary); + font-weight: 500; + font-size: 0.7rem; +} + +.calculation-section { + 
border-top: 1px solid var(--border-color); + padding-top: 1rem; + margin-top: 1rem; +} + +/* URL Format Guide */ +.url-format-guide { + transition: all 0.2s ease; +} + +.url-format-guide:hover { + border-color: var(--accent-primary); + box-shadow: 0 2px 8px rgba(81, 145, 238, 0.1); +} + +/* Toast notifications */ +.toast-container { + position: fixed; + top: 1rem; + right: 1rem; + z-index: 1000; + display: flex; + flex-direction: column; + gap: 0.5rem; + max-width: 400px; +} + +.toast { + background: var(--bg-card); + border: 1px solid var(--border-color); + border-radius: 8px; + padding: 1rem; + box-shadow: var(--shadow-lg); + display: flex; + align-items: center; + gap: 0.75rem; + max-width: 100%; +} + +.toast.show { + display: flex; +} + +.toast.hide { + display: none; +} + +.toast-content { + display: flex; + align-items: center; + gap: 0.75rem; + flex: 1; +} + +.toast-icon { + font-size: 1.2rem; + flex-shrink: 0; +} + +.toast-info { + flex: 1; + min-width: 0; +} + +.toast-title { + font-weight: 600; + color: var(--text-primary); + margin-bottom: 0.25rem; + font-size: 0.9rem; +} + +.toast-message { + color: var(--text-secondary); + font-size: 0.8rem; + word-wrap: break-word; +} + +.toast-close { + background: none; + border: none; + color: var(--text-secondary); + cursor: pointer; + font-size: 1.2rem; + padding: 0.25rem; + display: flex; + align-items: center; + justify-content: center; + border-radius: 4px; + flex-shrink: 0; +} + +.toast-close:hover { + background: var(--bg-secondary); + color: var(--text-primary); +} + +/* Utilities */ +.hidden { + display: none !important; +} + +.text-sm { + font-size: 0.8rem; +} + +.text-secondary { + color: var(--text-secondary); +} + +.w-full { + width: 100%; +} + + +/* Responsive design */ +@media (max-width: 768px) { + .container { + padding: 0 0.25rem; + } + + .card { + padding: 1rem; + } + + .btn-group { + flex-direction: column; + } + + .btn-group .btn { + min-width: auto; + } + + .model-source-selector { + 
flex-direction: column; + gap: 0.5rem; + } + + .metrics-row { + flex-direction: column; + gap: 0.5rem; + } + + .toast-container { + top: 0.5rem; + right: 0.5rem; + left: 0.5rem; + max-width: none; + } + + .modal-content { + margin: 0.5rem; + } +} + +@media (max-width: 480px) { + .card { + padding: 0.75rem; + } + + .btn { + padding: 0.625rem 1rem; + font-size: 0.8rem; + } + + .form-select, + .form-input { + padding: 0.625rem 0.875rem; + font-size: 0.8rem; + } +} + + +/* Focus visible for accessibility */ +.btn:focus-visible, +.form-select:focus-visible, +.form-input:focus-visible { + outline: 2px solid var(--accent-primary); + outline-offset: 2px; +}