Commit c068d86 (parent: 268732f)

feat(docker): update Dockerfile and add new vLLM configurations

- Added new provider configurations for vLLM Qwen 3.6 in both thinking and non-thinking modes.
- Updated the Dockerfile to include the new configuration files for vLLM Qwen 3.6 and ensure proper setup for deployment.

3 files changed: 370 additions & 1 deletion

File tree:
Dockerfile
examples/configs/vllm-qwen3.6-27b-fp8-no-think.provider.yml
examples/configs/vllm-qwen3.6-27b-fp8.provider.yml

Dockerfile

Lines changed: 3 additions & 1 deletion
@@ -171,8 +171,10 @@ COPY examples/configs/ollama-qwen332b-fp16-tc.provider.yml /opt/pentagi/conf/
 COPY examples/configs/ollama-qwq32b-fp16-tc.provider.yml /opt/pentagi/conf/
 COPY examples/configs/openrouter.provider.yml /opt/pentagi/conf/
 COPY examples/configs/novita.provider.yml /opt/pentagi/conf/
-COPY examples/configs/vllm-qwen3.5-27b-fp8.provider.yml /opt/pentagi/conf/
 COPY examples/configs/vllm-qwen3.5-27b-fp8-no-think.provider.yml /opt/pentagi/conf/
+COPY examples/configs/vllm-qwen3.5-27b-fp8.provider.yml /opt/pentagi/conf/
+COPY examples/configs/vllm-qwen3.6-27b-fp8-no-think.provider.yml /opt/pentagi/conf/
+COPY examples/configs/vllm-qwen3.6-27b-fp8.provider.yml /opt/pentagi/conf/
 COPY examples/configs/vllm-qwen332b-fp16.provider.yml /opt/pentagi/conf/

 COPY LICENSE /opt/pentagi/LICENSE
examples/configs/vllm-qwen3.6-27b-fp8-no-think.provider.yml (new file)
Lines changed: 193 additions & 0 deletions
# Qwen3.6-27B FP8 Provider Configuration - NON-THINKING MODE
# Based on official Qwen recommendations for vLLM inference
# Architecture: Hybrid 75% DeltaNet + 25% Full Attention (48+16 layers)
# Context: 262K native, expandable to 1M with YaRN
# Vision: VLM with Vision Encoder (uses VRAM even for text-only tasks)
#
# Thinking is disabled via the extra_body parameter
# Recommended sampling parameters:
# - General tasks: temp=0.7, top_p=0.8, top_k=20, min_p=0.0, pp=1.5, rp=1.0
# - Reasoning tasks: temp=1.0, top_p=0.95, top_k=20, min_p=0.0, pp=1.5, rp=1.0

simple:
  model: "Qwen/Qwen3.6-27B-FP8"
  temperature: 0.7
  top_k: 20
  top_p: 0.8
  min_p: 0.0
  presence_penalty: 1.5
  repetition_penalty: 1.0
  n: 1
  max_tokens: 32768
  extra_body:
    chat_template_kwargs:
      enable_thinking: false

simple_json:
  model: "Qwen/Qwen3.6-27B-FP8"
  temperature: 0.7
  top_k: 20
  top_p: 0.8
  min_p: 0.0
  presence_penalty: 1.5
  repetition_penalty: 1.0
  n: 1
  max_tokens: 32768
  json: true
  extra_body:
    chat_template_kwargs:
      enable_thinking: false

primary_agent:
  model: "Qwen/Qwen3.6-27B-FP8"
  temperature: 1.0
  top_k: 20
  top_p: 0.95
  min_p: 0.0
  presence_penalty: 1.5
  repetition_penalty: 1.0
  n: 1
  max_tokens: 32768
  extra_body:
    chat_template_kwargs:
      enable_thinking: false

assistant:
  model: "Qwen/Qwen3.6-27B-FP8"
  temperature: 1.0
  top_k: 20
  top_p: 0.95
  min_p: 0.0
  presence_penalty: 1.5
  repetition_penalty: 1.0
  n: 1
  max_tokens: 32768
  extra_body:
    chat_template_kwargs:
      enable_thinking: false

generator:
  model: "Qwen/Qwen3.6-27B-FP8"
  temperature: 1.0
  top_k: 20
  top_p: 0.95
  min_p: 0.0
  presence_penalty: 1.5
  repetition_penalty: 1.0
  n: 1
  max_tokens: 32768
  extra_body:
    chat_template_kwargs:
      enable_thinking: false

refiner:
  model: "Qwen/Qwen3.6-27B-FP8"
  temperature: 1.0
  top_k: 20
  top_p: 0.95
  min_p: 0.0
  presence_penalty: 1.5
  repetition_penalty: 1.0
  n: 1
  max_tokens: 32768
  extra_body:
    chat_template_kwargs:
      enable_thinking: false

adviser:
  model: "Qwen/Qwen3.6-27B-FP8"
  temperature: 1.0
  top_k: 20
  top_p: 0.95
  min_p: 0.0
  presence_penalty: 1.5
  repetition_penalty: 1.0
  n: 1
  max_tokens: 32768
  extra_body:
    chat_template_kwargs:
      enable_thinking: false

reflector:
  model: "Qwen/Qwen3.6-27B-FP8"
  temperature: 1.0
  top_k: 20
  top_p: 0.95
  min_p: 0.0
  presence_penalty: 1.5
  repetition_penalty: 1.0
  n: 1
  max_tokens: 32768
  extra_body:
    chat_template_kwargs:
      enable_thinking: false

searcher:
  model: "Qwen/Qwen3.6-27B-FP8"
  temperature: 0.7
  top_k: 20
  top_p: 0.8
  min_p: 0.0
  presence_penalty: 1.5
  repetition_penalty: 1.0
  n: 1
  max_tokens: 32768
  extra_body:
    chat_template_kwargs:
      enable_thinking: false

enricher:
  model: "Qwen/Qwen3.6-27B-FP8"
  temperature: 0.7
  top_k: 20
  top_p: 0.8
  min_p: 0.0
  presence_penalty: 1.5
  repetition_penalty: 1.0
  n: 1
  max_tokens: 32768
  extra_body:
    chat_template_kwargs:
      enable_thinking: false

coder:
  model: "Qwen/Qwen3.6-27B-FP8"
  temperature: 1.0
  top_k: 20
  top_p: 0.95
  min_p: 0.0
  presence_penalty: 1.5
  repetition_penalty: 1.0
  n: 1
  max_tokens: 32768
  extra_body:
    chat_template_kwargs:
      enable_thinking: false

installer:
  model: "Qwen/Qwen3.6-27B-FP8"
  temperature: 1.0
  top_k: 20
  top_p: 0.95
  min_p: 0.0
  presence_penalty: 1.5
  repetition_penalty: 1.0
  n: 1
  max_tokens: 32768
  extra_body:
    chat_template_kwargs:
      enable_thinking: false

pentester:
  model: "Qwen/Qwen3.6-27B-FP8"
  temperature: 1.0
  top_k: 20
  top_p: 0.95
  min_p: 0.0
  presence_penalty: 1.5
  repetition_penalty: 1.0
  n: 1
  max_tokens: 32768
  extra_body:
    chat_template_kwargs:
      enable_thinking: false
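Each section above maps onto an OpenAI-compatible chat-completions request against a vLLM server: the keys under `extra_body` are merged into the top level of the JSON request body, which is how `chat_template_kwargs.enable_thinking` reaches the model's chat template. A minimal sketch of that merge, assuming a plain dict payload (the `build_payload` helper is illustrative, not part of the project):

```python
# Sketch: turn the "simple" section above into a vLLM chat-completions body.
# The parameter values are taken from the config; the merge logic mirrors
# what an OpenAI-compatible client does with extra_body before POSTing
# to /v1/chat/completions.

simple_cfg = {
    "model": "Qwen/Qwen3.6-27B-FP8",
    "temperature": 0.7,
    "top_k": 20,
    "top_p": 0.8,
    "min_p": 0.0,
    "presence_penalty": 1.5,
    "repetition_penalty": 1.0,
    "n": 1,
    "max_tokens": 32768,
    "extra_body": {"chat_template_kwargs": {"enable_thinking": False}},
}

def build_payload(cfg: dict, messages: list) -> dict:
    """Flatten extra_body into the top-level request body."""
    body = {k: v for k, v in cfg.items() if k != "extra_body"}
    body.update(cfg.get("extra_body", {}))
    body["messages"] = messages
    return body

payload = build_payload(simple_cfg, [{"role": "user", "content": "hi"}])
```

After the merge, `chat_template_kwargs` sits at the top level of the body alongside `temperature` and friends, and `extra_body` itself is gone.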
examples/configs/vllm-qwen3.6-27b-fp8.provider.yml (new file)
Lines changed: 174 additions & 0 deletions
# Qwen3.6-27B FP8 Provider Configuration - THINKING MODE (default)
# Based on official Qwen recommendations for vLLM inference
# Architecture: Hybrid 75% DeltaNet + 25% Full Attention (48+16 layers)
# Context: 262K native, expandable to 1M with YaRN
# Vision: VLM with Vision Encoder (uses VRAM even for text-only tasks)
#
# Thinking mode is enabled by default (no extra_body needed)
# Recommended sampling parameters:
# - General tasks: temp=1.0, top_p=0.95, top_k=20, min_p=0.0, pp=1.5, rp=1.0
# - Precise coding: temp=0.6, top_p=0.95, top_k=20, min_p=0.0, pp=0.0, rp=1.0
#
# Some agents below disable thinking via the extra_body parameter
# Recommended sampling parameters for those:
# - General tasks: temp=0.7, top_p=0.8, top_k=20, min_p=0.0, pp=1.5, rp=1.0
# - Reasoning tasks: temp=1.0, top_p=0.95, top_k=20, min_p=0.0, pp=1.5, rp=1.0

simple:
  model: "Qwen/Qwen3.6-27B-FP8"
  temperature: 0.7
  top_k: 20
  top_p: 0.8
  min_p: 0.0
  presence_penalty: 1.5
  repetition_penalty: 1.0
  n: 1
  max_tokens: 32768
  extra_body:
    chat_template_kwargs:
      enable_thinking: false

simple_json:
  model: "Qwen/Qwen3.6-27B-FP8"
  temperature: 0.7
  top_k: 20
  top_p: 0.8
  min_p: 0.0
  presence_penalty: 1.5
  repetition_penalty: 1.0
  n: 1
  max_tokens: 32768
  json: true
  extra_body:
    chat_template_kwargs:
      enable_thinking: false

primary_agent:
  model: "Qwen/Qwen3.6-27B-FP8"
  temperature: 1.0
  top_k: 20
  top_p: 0.95
  min_p: 0.0
  presence_penalty: 1.5
  repetition_penalty: 1.0
  n: 1
  max_tokens: 32768

assistant:
  model: "Qwen/Qwen3.6-27B-FP8"
  temperature: 1.0
  top_k: 20
  top_p: 0.95
  min_p: 0.0
  presence_penalty: 1.5
  repetition_penalty: 1.0
  n: 1
  max_tokens: 32768

generator:
  model: "Qwen/Qwen3.6-27B-FP8"
  temperature: 1.0
  top_k: 20
  top_p: 0.95
  min_p: 0.0
  presence_penalty: 1.5
  repetition_penalty: 1.0
  n: 1
  max_tokens: 32768

refiner:
  model: "Qwen/Qwen3.6-27B-FP8"
  temperature: 1.0
  top_k: 20
  top_p: 0.95
  min_p: 0.0
  presence_penalty: 1.5
  repetition_penalty: 1.0
  n: 1
  max_tokens: 32768

adviser:
  model: "Qwen/Qwen3.6-27B-FP8"
  temperature: 1.0
  top_k: 20
  top_p: 0.95
  min_p: 0.0
  presence_penalty: 1.5
  repetition_penalty: 1.0
  n: 1
  max_tokens: 32768

reflector:
  model: "Qwen/Qwen3.6-27B-FP8"
  temperature: 1.0
  top_k: 20
  top_p: 0.95
  min_p: 0.0
  presence_penalty: 1.5
  repetition_penalty: 1.0
  n: 1
  max_tokens: 32768
  extra_body:
    chat_template_kwargs:
      enable_thinking: false

searcher:
  model: "Qwen/Qwen3.6-27B-FP8"
  temperature: 0.7
  top_k: 20
  top_p: 0.8
  min_p: 0.0
  presence_penalty: 1.5
  repetition_penalty: 1.0
  n: 1
  max_tokens: 32768
  extra_body:
    chat_template_kwargs:
      enable_thinking: false

enricher:
  model: "Qwen/Qwen3.6-27B-FP8"
  temperature: 0.7
  top_k: 20
  top_p: 0.8
  min_p: 0.0
  presence_penalty: 1.5
  repetition_penalty: 1.0
  n: 1
  max_tokens: 32768
  extra_body:
    chat_template_kwargs:
      enable_thinking: false

coder:
  model: "Qwen/Qwen3.6-27B-FP8"
  temperature: 0.6
  top_k: 20
  top_p: 0.95
  min_p: 0.0
  presence_penalty: 0.0
  repetition_penalty: 1.0
  n: 1
  max_tokens: 32768

installer:
  model: "Qwen/Qwen3.6-27B-FP8"
  temperature: 0.6
  top_k: 20
  top_p: 0.95
  min_p: 0.0
  presence_penalty: 0.0
  repetition_penalty: 1.0
  n: 1
  max_tokens: 32768

pentester:
  model: "Qwen/Qwen3.6-27B-FP8"
  temperature: 0.6
  top_k: 20
  top_p: 0.95
  min_p: 0.0
  presence_penalty: 0.0
  repetition_penalty: 1.0
  n: 1
  max_tokens: 32768
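The thinking-mode file's header recommends two sampling presets, and the agent sections apply them consistently: coder, installer, and pentester get the precise-coding preset, while the other thinking agents get the general preset. That grouping can be captured in a few lines; this is an illustrative sketch (the `preset_for` helper and role sets are not part of the project):

```python
# Sketch: the two recommended thinking-mode sampling presets from the
# header comments above, and which agent roles the config assigns to each.

GENERAL = {
    "temperature": 1.0, "top_p": 0.95, "top_k": 20,
    "min_p": 0.0, "presence_penalty": 1.5, "repetition_penalty": 1.0,
}
PRECISE_CODING = {
    "temperature": 0.6, "top_p": 0.95, "top_k": 20,
    "min_p": 0.0, "presence_penalty": 0.0, "repetition_penalty": 1.0,
}

# Roles that the config above samples with the precise-coding preset.
CODING_ROLES = {"coder", "installer", "pentester"}

def preset_for(role: str) -> dict:
    """Return the recommended thinking-mode sampling preset for a role."""
    return dict(PRECISE_CODING if role in CODING_ROLES else GENERAL)
```

The only differences between the presets are temperature (1.0 vs 0.6) and presence penalty (1.5 vs 0.0); top_p, top_k, min_p, and repetition penalty are shared.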
