Mf/add-support-for-llama-3-and-nemotron (#805)

michaelfeil · alvarobartt · web-flow · commit 882d02789883 · 2026-02-12T18:06:55.000+09:00
Co-authored-by: Alvaro Bartolome <36760800+alvarobartt@users.noreply.github.com>
diff --git a/backends/candle/src/layers/rotary.rs b/backends/candle/src/layers/rotary.rs
@@ -2,14 +2,21 @@ use candle::{DType, Device, Result, Tensor, D};
 use serde::Deserialize;
 
 #[derive(Debug, Clone, PartialEq, Deserialize)]
-pub struct NTKScaling {
-    pub factor: f32,
-}
-
-#[derive(Debug, Clone, PartialEq, Deserialize)]
-#[serde(tag = "type", rename_all = "kebab-case")]
+#[serde(untagged)]
 pub enum RopeScaling {
-    Ntk(NTKScaling),
+    Llama3 {
+        #[serde(alias = "type")]
+        rope_type: String,
+        factor: f32,
+        high_freq_factor: f32,
+        low_freq_factor: f32,
+        original_max_position_embeddings: usize,
+    },
+    Ntk {
+        #[serde(alias = "type")]
+        rope_type: String,
+        factor: f32,
+    },
 }
 
 pub fn get_inv_freqs(
@@ -29,9 +36,52 @@ pub fn get_inv_freqs(
 
     if let Some(rope_scaling) = rope_scaling {
         match rope_scaling {
-            RopeScaling::Ntk(ntk_scaling) => {
-                let inv_freqs = get_inv_freqs_inner(dim, base * ntk_scaling.factor, device)?;
-                let s = ntk_scaling.factor.powf(2.0 / dim as f32) as f64;
+            RopeScaling::Llama3 {
+                rope_type: _,
+                factor,
+                high_freq_factor,
+                low_freq_factor,
+                original_max_position_embeddings,
+            } => {
+                let old_context_len = *original_max_position_embeddings as f32;
+                let low_freq_wavelen = old_context_len / low_freq_factor;
+                let high_freq_wavelen = old_context_len / high_freq_factor;
+
+                let inv_freq: Vec<_> = (0..dim)
+                    .step_by(2)
+                    .map(|i| {
+                        let freq_idx = i as f32 / dim as f32;
+                        // Compute base inverse frequency
+                        let inv_freq_base = 1.0 / base.powf(freq_idx);
+
+                        // Compute wavelength from inverse frequency
+                        let wavelen = 2.0 * std::f32::consts::PI / inv_freq_base;
+
+                        // Apply Llama3 scaling logic
+                        if wavelen < high_freq_wavelen {
+                            // High frequency: no scaling
+                            inv_freq_base
+                        } else if wavelen > low_freq_wavelen {
+                            // Low frequency: scale by factor
+                            inv_freq_base / factor
+                        } else {
+                            // Medium frequency: smooth interpolation
+                            let smooth_factor = (old_context_len / wavelen - low_freq_factor)
+                                / (high_freq_factor - low_freq_factor);
+                            let inv_freq_llama = inv_freq_base / factor;
+                            (1.0 - smooth_factor) * inv_freq_llama + smooth_factor * inv_freq_base
+                        }
+                    })
+                    .collect();
+                let inv_freq_len = inv_freq.len();
+                return Tensor::from_vec(inv_freq, (1, inv_freq_len), device);
+            }
+            RopeScaling::Ntk {
+                rope_type: _,
+                factor,
+            } => {
+                let inv_freqs = get_inv_freqs_inner(dim, base * factor, device)?;
+                let s = factor.powf(2.0 / dim as f32) as f64;
                 return inv_freqs / s;
             }
         }
diff --git a/backends/candle/src/lib.rs b/backends/candle/src/lib.rs
@@ -298,7 +298,7 @@ impl CandleBackend {
                 tracing::info!("Starting MPNet model on {:?}", device);
                 Ok(Box::new(MPNetModel::load(vb, &config, model_type).s()?))
             }
-            (Config::Llama(_config), Device::Cpu | Device::Metal(_)) => Err(BackendError::Start(
+            (Config::Llama(_), Device::Cpu | Device::Metal(_)) => Err(BackendError::Start(
                 "Llama is only supported on Cuda devices in fp16 with flash attention enabled"
                     .to_string(),
             )),
@@ -531,8 +531,7 @@ impl CandleBackend {
             #[cfg(feature = "cuda")]
             (Config::Llama(config), Device::Cuda(_)) => {
                 match config.rope_scaling {
-                    Some(ref _rope_scaling) => {
-                        // error, as no rope scaling is supported for FlashLlama yet
+                    Some(_) => {
                         Err(BackendError::Start(
                             "Rope scaling is not supported for FlashLlama yet".to_string(),
                         ))
diff --git a/backends/candle/src/models/flash_mistral.rs b/backends/candle/src/models/flash_mistral.rs
@@ -11,6 +11,7 @@ struct MistralAttention {
     o_proj: Linear,
 
     window_size_left: Option<usize>,
+    use_bidirectional_attention: bool,
 
     num_attention_heads: usize,
     num_key_value_heads: usize,
@@ -24,6 +25,7 @@ struct MistralAttention {
 impl MistralAttention {
     pub fn load(vb: VarBuilder, config: &MistralConfig) -> Result<Self> {
         let window_size_left = config.sliding_window;
+        let use_bidirectional_attention = config.use_bidirectional_attention;
         let num_attention_heads = config.num_attention_heads;
         let attention_head_size = config.hidden_size / config.num_attention_heads;
         let num_key_value_heads = config.num_key_value_heads;
@@ -54,6 +56,7 @@ impl MistralAttention {
             qkv_linear,
             o_proj,
             window_size_left,
+            use_bidirectional_attention,
             num_attention_heads,
             num_key_value_heads,
             attention_head_size,
@@ -103,7 +106,7 @@ impl MistralAttention {
             max_s,
             max_s,
             self.softmax_scale,
-            true,
+            !self.use_bidirectional_attention,
             self.window_size_left,
             None,
         )?;
@@ -269,7 +272,7 @@ impl FlashMistralModel {
             layers[0].attention.attention_head_size,
             config.rope_theta,
             vb.device(),
-            None,
+            config.rope_scaling.as_ref(),
         )?;
         let (cos_cache, sin_cache) = get_cos_sin(
             config.max_position_embeddings,
diff --git a/backends/candle/src/models/mistral.rs b/backends/candle/src/models/mistral.rs
@@ -1,4 +1,4 @@
-use crate::layers::HiddenAct;
+use crate::layers::{HiddenAct, RopeScaling};
 use serde::Deserialize;
 
 #[derive(Debug, Clone, PartialEq, Deserialize)]
@@ -16,4 +16,7 @@ pub struct MistralConfig {
     pub model_type: Option<String>,
     pub rope_theta: f32,
     pub sliding_window: Option<usize>,
+    pub rope_scaling: Option<RopeScaling>,
+    #[serde(default)]
+    pub use_bidirectional_attention: bool,
 }