Skip to content

Commit ac51a36

Browse files
Resolve code review items:
- Revert the attention_sink assertion for multimodal models
- Parameterize the audio encoder input shape
- Fix comment typos
1 parent 2e4c9bc commit ac51a36

6 files changed

Lines changed: 24 additions & 7 deletions

File tree

examples/qualcomm/oss_scripts/llama/dataset.py

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -90,6 +90,19 @@ def _build_audio_dataset(
9090
wav, sr = soundfile.read(audio_path, always_2d=False)
9191
wav = torch.from_numpy(wav).float().unsqueeze(0) # [1, T]
9292

93+
# Pad to fixed length so input_features has shape [1, n_bins, input_dim]
94+
hop_length = processor.audio_processor.melspec_kwargs["hop_length"]
95+
target_raw_length = (config.n_bins * 2 - 1) * hop_length
96+
pad_size = target_raw_length - wav.shape[-1]
97+
if pad_size > 0:
98+
wav = torch.nn.functional.pad(wav, (0, pad_size))
99+
elif pad_size < 0:
100+
suggested_n_bins = (wav.shape[-1] // hop_length + 1) // 2
101+
raise ValueError(
102+
f"Audio length ({wav.shape[-1]} samples) exceeds target ({target_raw_length} samples) "
103+
f"derived from n_bins={config.n_bins}. Set n_bins >= {suggested_n_bins} in the config to avoid information loss."
104+
)
105+
93106
# Process audio with text prompt using HuggingFace processor
94107
input_features = processor(prompt, wav, return_tensors="pt").input_features
95108
dataset.append((input_features,))

examples/qualcomm/oss_scripts/llama/encoder/encoder_config.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -53,10 +53,11 @@ class AudioModalityConfig(MultiModalityConfig):
5353
"""
5454

5555
audio_seq_len: int
56+
n_bins: int
5657
audio_url: str
5758

5859
def create_encoder(self, config):
59-
return self.encoder_class(config)
60+
return self.encoder_class(config, n_bins=self.n_bins)
6061

6162

6263
@dataclass(init=False, frozen=True)
@@ -92,6 +93,7 @@ class GraniteSpeechEncoder(AudioModalityConfig):
9293

9394
encoder_class = GraniteSpeechCTCEncoderWrapper
9495
audio_seq_len = 171
96+
n_bins = 844
9597
audio_url = "https://huggingface.co/ibm-granite/granite-speech-3.3-2b/resolve/main/10226_10111_000000.wav?download=true"
9698
quant_recipe = GraniteSpeechEncoderQuantRecipe
9799
num_sharding = 8

examples/qualcomm/oss_scripts/llama/llama.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -635,7 +635,7 @@ def export_llama(args) -> None:
635635
)
636636
# TODO: Implement attention sink support for multimodal models (vision/audio).
637637
assert (
638-
is_multimodal or args.use_attention_sink is None
638+
not is_multimodal or args.use_attention_sink is None
639639
), "Multimodal models currently do not support attention sink feature."
640640

641641
if args.pre_gen_pte:

examples/qualcomm/oss_scripts/llama/model/audio_encoder.py

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -123,7 +123,7 @@ def forward(
123123

124124

125125
class GraniteSpeechCTCEncoderWrapper(nn.Module):
126-
def __init__(self, config: GraniteSpeechConfig):
126+
def __init__(self, config: GraniteSpeechConfig, n_bins: int):
127127
super().__init__()
128128
self.encoder = GraniteSpeechCTCEncoder(config.encoder_config)
129129
self.projector = GraniteSpeechEncoderProjector(config)
@@ -145,9 +145,11 @@ def __init__(self, config: GraniteSpeechConfig):
145145
)
146146

147147
self.config = config
148+
self.n_bins = n_bins
148149

149150
def get_example_inputs(self):
150-
return (torch.randn((1, 844, 160), dtype=torch.float32),)
151+
input_dim = self.config.encoder_config.input_dim
152+
return (torch.randn((1, self.n_bins, input_dim), dtype=torch.float32),)
151153

152154
def forward(self, hidden_states: torch.Tensor):
153155
encoder_embeds = self.encoder(hidden_states)

examples/qualcomm/oss_scripts/llama/runner/multimodal_runner/multimodal_embedding_merger.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -45,7 +45,7 @@ class MultimodalEmbeddingMerger {
4545
int32_t embedding_dim_;
4646
int32_t total_tokens_{0};
4747

48-
// merged embeddings are holded in this vector.
48+
// merged embeddings are held in this vector.
4949
std::vector<float> embeddings_;
5050
std::array<executorch::aten::TensorImpl::SizesType, 3> sizes_{};
5151
};

examples/qualcomm/oss_scripts/llama/tokenizer.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -184,8 +184,8 @@ def prepare_messages(self, prompts: List[str]): # noqa: C901
184184

185185
audio_paths = self.control_args.audio_path
186186
if hasattr(self.config, AUDIO_ENCODER):
187-
# Load image from user-specified path (URL or local file)
188-
# fall back to the default image URL if no image is provided.
187+
# Load audio from user-specified path (URL or local file)
188+
# fall back to the default audio URL if no audio is provided.
189189
if not audio_paths:
190190
audio_paths = [getattr(self.config, AUDIO_ENCODER).audio_url]
191191
warnings.warn(

0 commit comments

Comments
 (0)