2 files changed: +13 −3 lines changed

src/transformers/models/granite_speech

@@ -74,9 +74,15 @@ def __call__(
         )
         audio_embed_sizes = self._get_num_audio_features(audio_lengths)
         speech_inputs["audio_embed_sizes"] = audio_embed_sizes
-        # TODO: input_features_mask is not a great name, because
-        # input_features and input_features_mask have different shapes
-        # (before/after the projector)
+        # TODO (@alex-jw-brooks): Currently input_features_mask is not
+        # a great name, because input_features and input_features_mask
+        # have different shapes (before/after the projector).
+        #
+        # We should align this with other multimodal models, e.g., llava
+        # and qwen2audio, and refactor this to ensure input_features_mask
+        # has the same dimensionality as input_features, or compute it in
+        # the model based on the audio embedding sizes (since we do not
+        # have an attention mask for the audio features to infer padding from).
         speech_inputs["input_features_mask"] = torch.arange(max(audio_embed_sizes)).view(1, -1) < torch.tensor(
             audio_embed_sizes
         ).view(-1, 1)
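For context on the lines above, here is a minimal standalone sketch of what the broadcast comparison computes; the embed sizes are made-up example values:

```python
import torch

# Made-up per-sample audio embedding sizes; in the processor these come
# from self._get_num_audio_features(audio_lengths).
audio_embed_sizes = [3, 5, 2]

# Broadcast a [1, max_size] position index against a [batch, 1] column of
# sizes: positions below a sample's size are valid (True), the rest are
# padding (False).
positions = torch.arange(max(audio_embed_sizes)).view(1, -1)  # shape [1, 5]
sizes = torch.tensor(audio_embed_sizes).view(-1, 1)           # shape [3, 1]
input_features_mask = positions < sizes                       # shape [3, 5]

print(input_features_mask)
# tensor([[ True,  True,  True, False, False],
#         [ True,  True,  True,  True,  True],
#         [ True,  True, False, False, False]])
```

As the TODO notes, this mask is sized to the post-projector embedding lengths rather than to input_features itself, which is what makes the current name misleading.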
@@ -62,6 +62,10 @@ def __call__(
         # trigger the conditions due to the way they call multimodal
         # processors, e.g., vLLM.
         audio_inputs = self.audio_processor(audio, device=device)
+
+        # TODO (@alex-jw-brooks): we should add a util to get_num_audio_tokens
+        # from feature lengths and call it here, rather than returning it
+        # from the feature extractor.
         audio_embed_sizes = audio_inputs.pop("audio_embed_sizes")

         # Expand the audio placeholders to match the feature dims; this
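The TODO above proposes a get_num_audio_tokens utility that the processor could call on the feature lengths directly. A rough sketch of what such a helper might look like, assuming a simple fixed downsampling ratio (hypothetical; a real implementation would derive the ratio from the model/processor config rather than a hard-coded constant):

```python
import math

def get_num_audio_tokens(feature_lengths, downsample_factor=4):
    """Hypothetical helper: map per-sample feature lengths to the number
    of audio tokens (embeddings) the projector would emit.

    The downsample_factor here is a placeholder assumption, not the actual
    Granite Speech projector ratio.
    """
    return [math.ceil(length / downsample_factor) for length in feature_lengths]

# Example with made-up feature lengths:
print(get_num_audio_tokens([100, 37, 64]))  # [25, 10, 16]
```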