14 changes: 14 additions & 0 deletions Cargo.lock

1 change: 1 addition & 0 deletions Cargo.toml
@@ -48,6 +48,7 @@ candle = { version = "0.8", package = "candle-core" }
candle-nn = { version = "0.8" }
candle-transformers = { version = "0.8" }
candle-flash-attn = { version = "0.8" }
candle-flash-attn-3 = { version = "0.0.1", features = ["cuda-11"], default-features = false }
candle-cublaslt = { version = "0.0.1" }
candle-layer-norm = { version = "0.0.1" }
candle-index-select-cu = { version = "0.0.1", features = ["cuda-11"], default-features = false }
1 change: 1 addition & 0 deletions backends/Cargo.toml
@@ -29,3 +29,4 @@ mkl = ["text-embeddings-backend-candle?/mkl"]
accelerate = ["text-embeddings-backend-candle?/accelerate"]
flash-attn = ["text-embeddings-backend-candle?/flash-attn"]
flash-attn-v1 = ["text-embeddings-backend-candle?/flash-attn-v1"]
flash-attn-v3 = ["text-embeddings-backend-candle?/flash-attn-v3"]
2 changes: 2 additions & 0 deletions backends/candle/Cargo.toml
@@ -14,6 +14,7 @@ candle-nn = { workspace = true }
candle-transformers = { workspace = true }
candle-flash-attn = { workspace = true, optional = true}
candle-flash-attn-v1 = { workspace = true, optional = true }
candle-flash-attn-3 = { workspace = true, optional = true }
candle-cublaslt = { workspace = true, optional = true }
candle-index-select-cu = { workspace = true, optional = true, features = ["cuda-11"], default-features = false}
candle-layer-norm = { workspace = true, optional = true }
@@ -45,3 +46,4 @@ mkl = ["dep:intel-mkl-src", "candle/_mkl"]
cuda = ["candle/_cuda", "candle-nn/_cuda", "dep:candle-cublaslt", "dep:candle-layer-norm", "dep:candle-rotary", "dep:candle-index-select-cu"]
flash-attn-v1 = ["dep:candle-flash-attn-v1", "cuda"]
flash-attn = ["dep:candle-flash-attn", "cuda"]
flash-attn-v3 = ["dep:candle-flash-attn-3", "cuda"]
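
Taken together, the manifest changes above wire the new kernels through three layers: the workspace pins candle-flash-attn-3 0.0.1, backends/Cargo.toml exposes a flash-attn-v3 feature that is forwarded to the optional candle backend through Cargo's weak-dependency syntax (text-embeddings-backend-candle?/flash-attn-v3), and backends/candle/Cargo.toml maps that feature onto dep:candle-flash-attn-3 plus the existing cuda feature. The helper below is a hypothetical sketch, not part of this PR; it only illustrates how code built against these features could report which flash-attention crates were compiled in.

// Hypothetical helper (not in this PR): list the flash-attention kernel crates
// selected by the Cargo features defined above.
fn compiled_flash_attention_kernels() -> Vec<&'static str> {
    let mut kernels = Vec::new();
    if cfg!(feature = "flash-attn-v1") {
        kernels.push("candle-flash-attn-v1"); // used for compute capability 7.5
    }
    if cfg!(feature = "flash-attn") {
        kernels.push("candle-flash-attn"); // FA2, used for compute capability 8.0-9.0
    }
    if cfg!(feature = "flash-attn-v3") {
        kernels.push("candle-flash-attn-3"); // FA3, preferred on 8.0-9.0 when enabled
    }
    kernels
}
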
60 changes: 57 additions & 3 deletions backends/candle/src/flash_attn.rs
@@ -19,6 +19,14 @@ pub fn get_runtime_compute_cap() -> usize {
    }
}

static PRINT_ONCE: std::sync::Once = std::sync::Once::new();

fn print_version_once(version: &str) {
    PRINT_ONCE.call_once(|| {
        println!("Using michaelfeil/candle-flash-attn-3 v{}", version);
    });
}

#[allow(clippy::too_many_arguments, unused)]
pub(crate) fn flash_attn_varlen(
    q: &Tensor,
@@ -62,7 +70,53 @@ pub(crate) fn flash_attn_varlen(
        #[cfg(not(feature = "flash-attn-v1"))]
        candle::bail!("Flash attention v1 is not installed. Use `flash-attn-v1` feature.")
    } else if (80..90).contains(&runtime_compute_cap) || runtime_compute_cap == 90 {
        #[cfg(feature = "flash-attn")]
        #[cfg(feature = "flash-attn-v3")]
        {
            use candle_flash_attn_v3::{flash_attn_varlen_alibi_windowed, flash_attn_varlen_windowed};
            print_version_once("0.0.1");
            let window_size_right = if causal {
                Some(0)
            } else if window_size_right.is_some() {
                window_size_right
            } else {
                None
            };

            let attention = if let Some(alibi_slopes) = alibi_slopes {
                flash_attn_varlen_alibi_windowed(
                    q,
                    k,
                    v,
                    alibi_slopes,
                    seqlens_q,
                    seqlens_k,
                    max_seqlen_q,
                    max_seqlen_k,
                    softmax_scale,
                    window_size_left,
                    window_size_right,
                    false, // use_gqa_packing - set to false for now
                )
            } else {
                flash_attn_varlen_windowed(
                    q,
                    k,
                    v,
                    seqlens_q,
                    seqlens_k,
                    max_seqlen_q,
                    max_seqlen_k,
                    softmax_scale,
                    window_size_left,
                    window_size_right,
                    false, // use_gqa_packing - set to false for now
                )
            };

            return attention;
        }

        #[cfg(all(not(feature = "flash-attn-v3"), feature = "flash-attn"))]
        {
            use candle_flash_attn::{flash_attn_varlen_alibi_windowed, flash_attn_varlen_windowed};

@@ -105,8 +159,8 @@ pub(crate) fn flash_attn_varlen(

            return attention;
        }
        #[cfg(not(feature = "flash-attn"))]
        candle::bail!("Flash attention is not installed. Use `flash-attn` feature.")
        #[cfg(not(any(feature = "flash-attn-v3", feature = "flash-attn")))]
        candle::bail!("Flash attention is not installed. Use `flash-attn-v3` or `flash-attn` feature.")
    }
    candle::bail!(
        "GPU with CUDA capability {} is not supported",
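
As a reading aid, the control flow that flash_attn_varlen implements after this patch can be summarised as follows. This is a hypothetical sketch rather than code from the PR: the tensor arguments are omitted and only the kernel selection is shown. The behavioural point is that on compute capability 8.0-9.0 the v3 path now takes priority, because the existing flash-attn (FA2) block is gated on not(feature = "flash-attn-v3").

// Hypothetical summary (not in this PR): which kernel flash_attn_varlen selects
// for a given runtime compute capability once the new cfg branches are applied.
use candle::Result;

fn selected_kernel(runtime_compute_cap: usize) -> Result<&'static str> {
    if runtime_compute_cap == 75 {
        #[cfg(feature = "flash-attn-v1")]
        return Ok("candle-flash-attn-v1");
        #[cfg(not(feature = "flash-attn-v1"))]
        candle::bail!("Flash attention v1 is not installed. Use `flash-attn-v1` feature.")
    } else if (80..=90).contains(&runtime_compute_cap) {
        #[cfg(feature = "flash-attn-v3")]
        return Ok("candle-flash-attn-3");
        #[cfg(all(not(feature = "flash-attn-v3"), feature = "flash-attn"))]
        return Ok("candle-flash-attn");
        #[cfg(not(any(feature = "flash-attn-v3", feature = "flash-attn")))]
        candle::bail!("Flash attention is not installed. Use `flash-attn-v3` or `flash-attn` feature.")
    }
    candle::bail!("GPU with CUDA capability {} is not supported", runtime_compute_cap)
}
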