dphnAI
diff --git a/aphrodite/config/speculative.py b/aphrodite/config/speculative.py
Lines changed: 7 additions & 0 deletions
diff --git a/aphrodite/modeling/layers/linear.py b/aphrodite/modeling/layers/linear.py
Lines changed: 66 additions & 13 deletions
diff --git a/aphrodite/modeling/layers/mamba/abstract.py b/aphrodite/modeling/layers/mamba/abstract.py
Lines changed: 1 addition & 1 deletion
diff --git a/aphrodite/modeling/layers/mamba/mamba_utils.py b/aphrodite/modeling/layers/mamba/mamba_utils.py
Lines changed: 7 additions & 2 deletions
diff --git a/aphrodite/modeling/models/config.py b/aphrodite/modeling/models/config.py
Lines changed: 13 additions & 0 deletions
@@ -34,6 +34,7 @@
     "deepseek_mtp",
     "ernie_mtp",
     "qwen3_next_mtp",
+    "qwen3_5_mtp",
     "mimo_mtp",
     "longcat_flash_mtp",
     "mtp",
@@ -46,6 +47,7 @@
     "glm4_moe_lite_mtp",
     "ernie_mtp",
     "qwen3_next_mtp",
+    "qwen3_5_mtp",
     "longcat_flash_mtp",
 )
 
@@ -218,6 +220,11 @@ def hf_config_override(hf_config: PretrainedConfig) -> PretrainedConfig:
         if hf_config.model_type == "qwen3_next_mtp":
             n_predict = getattr(hf_config, "num_nextn_predict_layers", None)
             hf_config.update({"n_predict": n_predict, "architectures": ["Qwen3NextMTP"]})
+        if hf_config.model_type in ("qwen3_5", "qwen3_5_moe"):
+            hf_config.model_type = "qwen3_5_mtp"
+        if hf_config.model_type == "qwen3_5_mtp":
+            n_predict = getattr(hf_config.text_config, "mtp_num_hidden_layers", 1)
+            hf_config.update({"n_predict": n_predict, "architectures": ["Qwen3_5MTP"]})
         if hf_config.model_type == "longcat_flash":
             hf_config.model_type = "longcat_flash_mtp"
             n_predict = getattr(hf_config, "num_nextn_predict_layers", 1)
 
@@ -72,6 +72,14 @@ def adjust_marlin_shard(param, shard_size, shard_offset):
     return shard_size * marlin_tile_size, shard_offset * marlin_tile_size
 
 
+def adjust_block_scale_shard(weight_block_size, shard_size, shard_offset):
+    """Rescale a shard's size and offset by the first weight-block dimension.
+
+    Both ``shard_size`` and ``shard_offset`` are divided by
+    ``weight_block_size[0]``, rounding up, and returned as
+    ``(shard_size, shard_offset)`` -- note that the size comes first.
+    ``weight_block_size`` must not be ``None``.
+    """
+    assert weight_block_size is not None
+    block_len = weight_block_size[0]
+    # Adding (block_len - 1) before floor division yields a ceiling division.
+    round_up = block_len - 1
+    scaled_offset = (shard_offset + round_up) // block_len
+    scaled_size = (shard_size + round_up) // block_len
+    return scaled_size, scaled_offset
+
+
 def adjust_bitsandbytes_4bit_shard(
     param: Parameter, shard_offsets: dict[str, tuple[int, int]], loaded_shard_id: str
 ) -> tuple[int, int]:
@@ -744,7 +752,12 @@ def weight_loader(
         assert param_data.shape == loaded_weight.shape
         param_data.copy_(loaded_weight)
 
-    def _load_fused_module_from_checkpoint(self, param: BaseAphroditeParameter, loaded_weight: torch.Tensor):
+    def _load_fused_module_from_checkpoint(
+        self,
+        param: BaseAphroditeParameter,
+        loaded_weight: torch.Tensor,
+        output_sizes: list[int] | None = None,
+    ):
         """
         Handle special case for models where MLP layers are already
         fused on disk. In this case, we have no shard id. This function
@@ -757,7 +770,8 @@ def _load_fused_module_from_checkpoint(self, param: BaseAphroditeParameter, load
 
         current_shard_offset = 0
         shard_offsets: list[tuple[int, int, int]] = []
-        for i, output_size in enumerate(self.output_sizes):
+        output_sizes = output_sizes or self.output_sizes
+        for i, output_size in enumerate(output_sizes):
             shard_offsets.append((i, current_shard_offset, output_size))
             current_shard_offset += output_size
 
@@ -776,37 +790,76 @@ def _load_fused_module_from_checkpoint(self, param: BaseAphroditeParameter, load
             loaded_weight_shard = loaded_weight.narrow(param.output_dim, shard_offset, shard_size)
             self.weight_loader_v2(param, loaded_weight_shard, shard_id)
 
+    def validate_shard_id(self, loaded_shard_id: int | tuple[int, ...] | None):
+        """Check that a shard id refers to valid entries of ``self.output_sizes``.
+
+        ``None`` is accepted as-is. An ``int`` must be a valid index into
+        ``self.output_sizes``. A ``tuple`` must contain only valid indices,
+        and each entry must be exactly one greater than the entry before it.
+
+        Raises:
+            ValueError: If an index is out of range, if tuple indices are
+                not consecutive, or if the value is neither ``None``, an
+                ``int`` nor a ``tuple``.
+        """
+        if loaded_shard_id is None:
+            return
+
+        if not isinstance(loaded_shard_id, (tuple, int)):
+            raise ValueError("This line should not be reached")
+
+        if isinstance(loaded_shard_id, int):
+            if not (0 <= loaded_shard_id < len(self.output_sizes)):
+                raise ValueError(
+                    f"Shard id should be between 0 and {len(self.output_sizes) - 1}. "
+                    f"Got shard id {loaded_shard_id}."
+                )
+            return
+
+        # Tuple case: range-check every index before looking at adjacency.
+        for idx in loaded_shard_id:
+            if not (0 <= idx < len(self.output_sizes)):
+                raise ValueError(
+                    f"Shard id index {idx} should be between 0 and "
+                    f"{len(self.output_sizes) - 1}. Got shard id {loaded_shard_id}."
+                )
+
+        # Pairing the tuple with its own tail yields nothing for tuples of
+        # length 0 or 1, so those pass without an explicit length check.
+        for prev_idx, next_idx in zip(loaded_shard_id, loaded_shard_id[1:]):
+            if next_idx - prev_idx != 1:
+                raise ValueError(
+                    "Shard id with multiple indices should be consecutive. "
+                    f"Got shard id {loaded_shard_id}."
+                )
+
     def weight_loader_v2(
         self,
         param: BaseAphroditeParameter,
         loaded_weight: torch.Tensor,
-        loaded_shard_id: int | None = None,
+        loaded_shard_id: tuple[int, ...] | int | None = None,
     ):
-        if loaded_shard_id is None:
+        self.validate_shard_id(loaded_shard_id)
+        if loaded_shard_id is None or isinstance(loaded_shard_id, tuple):
             if isinstance(param, PerTensorScaleParameter):
                 param.load_merged_column_weight(loaded_weight=loaded_weight, shard_id=0)
                 return
             elif type(param) in (RowAphroditeParameter, BaseAphroditeParameter):
                 param.load_merged_column_weight(loaded_weight=loaded_weight)
                 return
-            # TODO: @dsikka - move to parameter.py
-            self._load_fused_module_from_checkpoint(param, loaded_weight)
+            output_sizes = (
+                [self.output_sizes[idx] for idx in loaded_shard_id]
+                if loaded_shard_id
+                else None
+            )
+            if isinstance(param, BlockQuantScaleParameter):
+                weight_block_size = getattr(self, "weight_block_size", None)
+                output_sizes = [
+                    adjust_block_scale_shard(weight_block_size, size, 0)[0]
+                    for size in (output_sizes or self.output_sizes)
+                ]
+            self._load_fused_module_from_checkpoint(
+                param, loaded_weight, output_sizes=output_sizes
+            )
             return
 
         assert loaded_shard_id < len(self.output_sizes)
 
+        shard_offset = sum(self.output_sizes[:loaded_shard_id]) // self.tp_size
+        shard_size = self.output_sizes[loaded_shard_id] // self.tp_size
+
         if isinstance(param, BlockQuantScaleParameter):
             assert self.quant_method is not None
-            # Assume the weight block size has been set by quant method
             assert hasattr(self, "weight_block_size")
             weight_block_size = self.weight_block_size
             assert weight_block_size is not None
-            block_n, _ = weight_block_size[0], weight_block_size[1]
-            shard_offset = ((sum(self.output_sizes[:loaded_shard_id]) + block_n - 1) // block_n) // self.tp_size
-            shard_size = (self.output_sizes[loaded_shard_id] + block_n - 1) // block_n // self.tp_size
-        else:
-            shard_offset = sum(self.output_sizes[:loaded_shard_id]) // self.tp_size
-            shard_size = self.output_sizes[loaded_shard_id] // self.tp_size
+            shard_size, shard_offset = adjust_block_scale_shard(
+                weight_block_size, shard_size, shard_offset
+            )
 
         param.load_merged_column_weight(
             loaded_weight=loaded_weight,
 
@@ -48,7 +48,7 @@ def get_state_dtype(self) -> tuple[torch.dtype, ...]:
     def get_kv_cache_spec(self, aphrodite_config: AphroditeConfig) -> KVCacheSpec | None:
         if (
             aphrodite_config.speculative_config is not None
-            and aphrodite_config.model_config.hf_config.model_type not in ["qwen3_next"]
+            and aphrodite_config.model_config.hf_config.model_type not in ["qwen3_next", "qwen3_5", "qwen3_5_moe"]
         ):
             raise NotImplementedError("Mamba with speculative decoding is not supported yet.")
         mamba_block_size = aphrodite_config.cache_config.mamba_block_size
 
@@ -66,9 +66,14 @@ def gated_delta_net_state_dtype(
         cls,
         model_dtype: ModelDType | torch.dtype,
         mamba_cache_dtype: MambaDType,
+        mamba_ssm_cache_dtype: MambaDType = "auto",
     ) -> tuple[torch.dtype, torch.dtype]:
-        state_dtype = get_kv_cache_torch_dtype(mamba_cache_dtype, model_dtype)
-        return (state_dtype, state_dtype)
+        conv_state_dtype = get_kv_cache_torch_dtype(mamba_cache_dtype, model_dtype)
+        if mamba_ssm_cache_dtype == "auto":
+            temporal_state_dtype = conv_state_dtype
+        else:
+            temporal_state_dtype = STR_DTYPE_TO_TORCH_DTYPE[mamba_ssm_cache_dtype]
+        return (conv_state_dtype, temporal_state_dtype)
 
     @classmethod
     def kda_state_dtype(
 
@@ -457,6 +457,17 @@ def verify_and_update_config(cls, aphrodite_config: "AphroditeConfig") -> None:
             logger.info("Using bfloat16 kv-cache for DeepSeekV3.2")
 
 
+class Qwen3_5ForConditionalGenerationConfig(HybridAttentionMambaModelConfig):
+    """Config hook that resolves an ``"auto"`` mamba SSM cache dtype."""
+
+    @classmethod
+    def verify_and_update_config(cls, aphrodite_config: "AphroditeConfig") -> None:
+        """Run the parent checks, then fill in ``mamba_ssm_cache_dtype``.
+
+        When the cache config's ``mamba_ssm_cache_dtype`` is ``"auto"``, it
+        is replaced by the HF text config's ``mamba_ssm_dtype`` attribute,
+        or by ``"float32"`` if that attribute is absent. Any other value is
+        left untouched.
+        """
+        super().verify_and_update_config(aphrodite_config)
+
+        cache_cfg = aphrodite_config.cache_config
+        if cache_cfg.mamba_ssm_cache_dtype != "auto":
+            # An explicit dtype was already chosen; nothing to resolve.
+            return
+
+        hf_text_cfg = aphrodite_config.model_config.hf_text_config
+        resolved_dtype = getattr(hf_text_cfg, "mamba_ssm_dtype", "float32")
+        cache_cfg.mamba_ssm_cache_dtype = resolved_dtype
+
+
 MODELS_CONFIG_MAP: dict[str, type[VerifyAndUpdateConfig]] = {
     "GteModel": SnowflakeGteNewModelConfig,
     "GteNewModel": GteNewModelConfig,
@@ -474,4 +485,6 @@ def verify_and_update_config(cls, aphrodite_config: "AphroditeConfig") -> None:
     "Mamba2ForCausalLM": MambaModelConfig,
     "FalconMambaForCausalLM": MambaModelConfig,
     "DeepseekV32ForCausalLM": DeepseekV32ForCausalLM,
+    "Qwen3_5ForConditionalGeneration": Qwen3_5ForConditionalGenerationConfig,
+    "Qwen3_5MoeForConditionalGeneration": Qwen3_5ForConditionalGenerationConfig,
 }