up

metascroy · metascroy · commit 8f047dd40e07 · 2026-04-07T14:34:40.000-07:00
diff --git a/examples/models/parakeet/export_parakeet_tdt.py b/examples/models/parakeet/export_parakeet_tdt.py
@@ -640,7 +640,15 @@ def main():
         "--backend",
         type=str,
         default="xnnpack",
-        choices=["portable", "xnnpack", "metal", "mlx", "cuda", "cuda-windows", "vulkan"],
+        choices=[
+            "portable",
+            "xnnpack",
+            "metal",
+            "mlx",
+            "cuda",
+            "cuda-windows",
+            "vulkan",
+        ],
         help="Backend for acceleration (default: xnnpack)",
     )
     parser.add_argument(
diff --git a/examples/models/voxtral_realtime/model.py b/examples/models/voxtral_realtime/model.py
@@ -700,7 +700,10 @@ def __init__(self, config: VoxtralRealtimeConfig):
             if self.backend == "mlx":
                 cache_dtype = self.wq.weight.dtype
                 self.kv_cache = MLXKVCache(
-                    config.sliding_window, self.n_kv_heads, self.head_dim, dtype=cache_dtype
+                    config.sliding_window,
+                    self.n_kv_heads,
+                    self.head_dim,
+                    dtype=cache_dtype,
                 )
                 self.sdpa = MLXSDPA(self.n_heads, self.head_dim)
             elif self.backend == "metal":
@@ -1170,7 +1173,7 @@ def __init__(self, model: VoxtralRealtimeModel, max_enc_len: int = 750):
         elif config.backend == "metal":
             self.kv_caches = nn.ModuleList(
                 [
-                    StandardEncoderRingKVCache(
+                    StandardRingKVCache(
                         max_enc_len, config.enc_n_heads, config.enc_head_dim
                     )
                     for _ in range(config.enc_n_layers)
@@ -1184,7 +1187,7 @@ def __init__(self, model: VoxtralRealtimeModel, max_enc_len: int = 750):
         elif config.backend == "cuda":
             self.kv_caches = nn.ModuleList(
                 [
-                    StandardEncoderRingKVCache(
+                    StandardRingKVCache(
                         max_enc_len, config.enc_n_heads, config.enc_head_dim
                     )
                     for _ in range(config.enc_n_layers)
@@ -1198,9 +1201,7 @@ def __init__(self, model: VoxtralRealtimeModel, max_enc_len: int = 750):
         else:
             self.kv_caches = nn.ModuleList(
                 [
-                    EncoderRingKVCache(
-                        max_enc_len, config.enc_n_heads, config.enc_head_dim
-                    )
+                    RingKVCache(max_enc_len, config.enc_n_heads, config.enc_head_dim)
                     for _ in range(config.enc_n_layers)
                 ]
             )

Side-by-side view of the hunks in examples/models/voxtral_realtime/model.py:
Original file line number	Diff line number	Diff line change
`@@ -700,7 +700,10 @@ def __init__(self, config: VoxtralRealtimeConfig):`
`700`	`700`	`if self.backend == "mlx":`
`701`	`701`	`cache_dtype = self.wq.weight.dtype`
`702`	`702`	`self.kv_cache = MLXKVCache(`
`703`		`- config.sliding_window, self.n_kv_heads, self.head_dim, dtype=cache_dtype`
	`703`	`+ config.sliding_window,`
	`704`	`+ self.n_kv_heads,`
	`705`	`+ self.head_dim,`
	`706`	`+ dtype=cache_dtype,`
`704`	`707`	`)`
`705`	`708`	`self.sdpa = MLXSDPA(self.n_heads, self.head_dim)`
`706`	`709`	`elif self.backend == "metal":`
`@@ -1170,7 +1173,7 @@ def __init__(self, model: VoxtralRealtimeModel, max_enc_len: int = 750):`
`1170`	`1173`	`elif config.backend == "metal":`
`1171`	`1174`	`self.kv_caches = nn.ModuleList(`
`1172`	`1175`	`[`
`1173`		`- StandardEncoderRingKVCache(`
	`1176`	`+ StandardRingKVCache(`
`1174`	`1177`	`max_enc_len, config.enc_n_heads, config.enc_head_dim`
`1175`	`1178`	`)`
`1176`	`1179`	`for _ in range(config.enc_n_layers)`
`@@ -1184,7 +1187,7 @@ def __init__(self, model: VoxtralRealtimeModel, max_enc_len: int = 750):`
`1184`	`1187`	`elif config.backend == "cuda":`
`1185`	`1188`	`self.kv_caches = nn.ModuleList(`
`1186`	`1189`	`[`
`1187`		`- StandardEncoderRingKVCache(`
	`1190`	`+ StandardRingKVCache(`
`1188`	`1191`	`max_enc_len, config.enc_n_heads, config.enc_head_dim`
`1189`	`1192`	`)`
`1190`	`1193`	`for _ in range(config.enc_n_layers)`
`@@ -1198,9 +1201,7 @@ def __init__(self, model: VoxtralRealtimeModel, max_enc_len: int = 750):`
`1198`	`1201`	`else:`
`1199`	`1202`	`self.kv_caches = nn.ModuleList(`
`1200`	`1203`	`[`
`1201`		`- EncoderRingKVCache(`
`1202`		`- max_enc_len, config.enc_n_heads, config.enc_head_dim`
`1203`		`- )`
	`1204`	`+ RingKVCache(max_enc_len, config.enc_n_heads, config.enc_head_dim)`
`1204`	`1205`	`for _ in range(config.enc_n_layers)`
`1205`	`1206`	`]`
`1206`	`1207`	`)`