@@ -87,9 +87,10 @@ defmodule Bumblebee.Text.Gemma3Text do
8787 doc:
8888 "the standard deviation of the normal initializer used for initializing kernel parameters"
8989 ] ,
90- sliding_window : [
90+ attention_window_size : [
9191 default: 4096 ,
92- doc: "the sliding window size for local attention layers"
92+ doc:
93+ "window size for both sides of the sliding attention window (used for `:sliding_attention` layers)"
9394 ] ,
9495 layer_types: [
9596 default: nil ,
@@ -377,14 +378,14 @@ defmodule Bumblebee.Text.Gemma3Text do
377378 key_norm = & Layers . rms_norm ( & 1 , shift: 1.0 , epsilon: spec . layer_norm_epsilon , name: & 2 )
378379
379380 # Per-layer attention window size based on layer_types
380- # :sliding_attention uses local (sliding window) attention
381+ # :sliding_attention uses local (sliding window) attention, sized by spec.attention_window_size
381382 # :full_attention uses global attention (nil window size)
382383 layer_types = spec . layer_types || generate_layer_types ( spec . num_blocks )
383384
384385 attention_window_size = fn idx ->
385386 case Enum . at ( layer_types , idx , :sliding_attention ) do
386387 :full_attention -> nil
387- :sliding_attention -> { spec . sliding_window , spec . sliding_window }
388+ :sliding_attention -> { spec . attention_window_size , spec . attention_window_size }
388389 end
389390 end
390391
@@ -596,7 +597,7 @@ defmodule Bumblebee.Text.Gemma3Text do
596597 { "rope_scaling" , optional ( scaling_strategy_converter ) } ,
597598 initializer_scale: { "initializer_range" , number ( ) } ,
598599 layer_norm_epsilon: { "rms_norm_eps" , number ( ) } ,
599- sliding_window : { "sliding_window" , optional ( number ( ) ) } ,
600+ attention_window_size : { "sliding_window" , optional ( number ( ) ) } ,
600601 layer_types:
601602 { "layer_types" ,
602603 list (
0 commit comments