Skip to content

Commit a2fe33e

Browse files
feat(explainability): add new explainability pipeline with label attention
- given a parameter, retrieve the label-attention matrix from the forward pass
- keep compatibility with Captum attributions
- update tests accordingly
1 parent 0a1880b commit a2fe33e

4 files changed

Lines changed: 77 additions & 32 deletions

File tree

tests/test_pipeline.py

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -189,13 +189,19 @@ def run_full_pipeline(
189189

190190
# Predict with explanations
191191
top_k = 5
192-
predictions = ttc.predict(X, top_k=top_k, explain=True)
192+
193+
predictions = ttc.predict(
194+
X,
195+
top_k=top_k,
196+
explain_with_label_attention=label_attention_enabled,
197+
explain_with_captum=True,
198+
)
193199

194200
# Test explainability functions
195201
text_idx = 0
196202
text = sample_text_data[text_idx]
197203
offsets = predictions["offset_mapping"][text_idx]
198-
attributions = predictions["attributions"][text_idx]
204+
attributions = predictions["captum_attributions"][text_idx]
199205
word_ids = predictions["word_ids"][text_idx]
200206

201207
words, word_attributions = map_attributions_to_word(attributions, text, word_ids, offsets)

torchTextClassifiers/model/components/text_embedder.py

Lines changed: 7 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -170,13 +170,10 @@ def forward(
170170
return_label_attention_matrix=return_label_attention_matrix,
171171
).values()
172172

173-
if return_label_attention_matrix:
174-
return (
175-
text_embedding,
176-
label_attention_matrix,
177-
) # label_attention_matrix is None if label attention is disabled
178-
else:
179-
return text_embedding
173+
return {
174+
"sentence_embedding": text_embedding,
175+
"label_attention_matrix": label_attention_matrix,
176+
}
180177

181178
def _get_sentence_embedding(
182179
self,
@@ -304,6 +301,9 @@ def forward(self, token_embeddings, compute_attention_matrix: Optional[bool] = F
304301
305302
"""
306303
B, T, C = token_embeddings.size()
304+
if isinstance(compute_attention_matrix, torch.Tensor):
305+
compute_attention_matrix = compute_attention_matrix[0].item()
306+
compute_attention_matrix = bool(compute_attention_matrix)
307307

308308
# 1. Create label indices [0, 1, ..., C-1] for the whole batch
309309
label_indices = torch.arange(self.num_classes).expand(B, -1)

torchTextClassifiers/model/model.py

Lines changed: 14 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -118,6 +118,7 @@ def forward(
118118
input_ids: Annotated[torch.Tensor, "batch seq_len"],
119119
attention_mask: Annotated[torch.Tensor, "batch seq_len"],
120120
categorical_vars: Annotated[torch.Tensor, "batch num_cats"],
121+
return_label_attention_matrix: bool = False,
121122
**kwargs,
122123
) -> torch.Tensor:
123124
"""
@@ -136,7 +137,16 @@ def forward(
136137
if self.text_embedder is None:
137138
x_text = encoded_text.float()
138139
else:
139-
x_text = self.text_embedder(input_ids=encoded_text, attention_mask=attention_mask)
140+
text_embed_output = self.text_embedder(
141+
input_ids=encoded_text,
142+
attention_mask=attention_mask,
143+
return_label_attention_matrix=return_label_attention_matrix,
144+
)
145+
x_text = text_embed_output["sentence_embedding"]
146+
if isinstance(return_label_attention_matrix, torch.Tensor):
147+
return_label_attention_matrix = return_label_attention_matrix[0].item()
148+
if return_label_attention_matrix:
149+
label_attention_matrix = text_embed_output["label_attention_matrix"]
140150

141151
if self.categorical_variable_net:
142152
x_cat = self.categorical_variable_net(categorical_vars)
@@ -166,4 +176,7 @@ def forward(
166176

167177
logits = self.classification_head(norm(x_combined)).squeeze(-1)
168178

179+
if return_label_attention_matrix:
180+
return {"logits": logits, "label_attention_matrix": label_attention_matrix}
181+
169182
return logits

torchTextClassifiers/torchTextClassifiers.py

Lines changed: 48 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -492,13 +492,15 @@ def predict(
492492
self,
493493
X_test: np.ndarray,
494494
top_k=1,
495-
explain=False,
495+
explain_with_label_attention: bool = False,
496+
explain_with_captum=False,
496497
):
497498
"""
498499
Args:
499500
X_test (np.ndarray): input data to predict on, shape (N,d) where the first column is text and the rest are categorical variables
500501
top_k (int): for each sentence, return the top_k most likely predictions (default: 1)
501-
explain (bool): launch gradient integration to have an explanation of the prediction (default: False)
502+
explain_with_label_attention (bool): if enabled, use attention matrix labels x tokens to have an explanation of the prediction (default: False)
503+
explain_with_captum (bool): launch gradient integration with Captum for explanation (default: False)
502504
503505
Returns: A dictionary containing the following fields:
504506
- predictions (torch.Tensor, shape (len(text), top_k)): A tensor containing the top_k most likely codes to the query.
@@ -507,6 +509,7 @@ def predict(
507509
- attributions (torch.Tensor, shape (len(text), top_k, seq_len)): A tensor containing the attributions for each token in the text.
508510
"""
509511

512+
explain = explain_with_label_attention or explain_with_captum
510513
if explain:
511514
return_offsets_mapping = True # to be passed to the tokenizer
512515
return_word_ids = True
@@ -515,13 +518,19 @@ def predict(
515518
"Explainability is not supported when the tokenizer outputs vectorized text directly. Please use a tokenizer that outputs token IDs."
516519
)
517520
else:
518-
if not HAS_CAPTUM:
519-
raise ImportError(
520-
"Captum is not installed and is required for explainability. Run 'pip install/uv add torchFastText[explainability]'."
521-
)
522-
lig = LayerIntegratedGradients(
523-
self.pytorch_model, self.pytorch_model.text_embedder.embedding_layer
524-
) # initialize a Captum layer gradient integrator
521+
if explain_with_captum:
522+
if not HAS_CAPTUM:
523+
raise ImportError(
524+
"Captum is not installed and is required for explainability. Run 'pip install/uv add torchFastText[explainability]'."
525+
)
526+
lig = LayerIntegratedGradients(
527+
self.pytorch_model, self.pytorch_model.text_embedder.embedding_layer
528+
) # initialize a Captum layer gradient integrator
529+
if explain_with_label_attention:
530+
if not self.enable_label_attention:
531+
raise RuntimeError(
532+
"Label attention explainability is enabled, but the model was not configured with label attention. Please enable label attention in the model configuration during initialization and retrain."
533+
)
525534
else:
526535
return_offsets_mapping = False
527536
return_word_ids = False
@@ -553,9 +562,19 @@ def predict(
553562
else:
554563
categorical_vars = torch.empty((encoded_text.shape[0], 0), dtype=torch.float32)
555564

556-
pred = self.pytorch_model(
557-
encoded_text, attention_mask, categorical_vars
565+
model_output = self.pytorch_model(
566+
encoded_text,
567+
attention_mask,
568+
categorical_vars,
569+
return_label_attention_matrix=explain_with_label_attention,
558570
) # forward pass, contains the prediction scores (len(text), num_classes)
571+
pred = (
572+
model_output["logits"] if explain_with_label_attention else model_output
573+
) # (batch_size, num_classes)
574+
575+
label_attention_matrix = (
576+
model_output["label_attention_matrix"] if explain_with_label_attention else None
577+
)
559578

560579
label_scores = pred.detach().cpu().softmax(dim=1) # convert to probabilities
561580

@@ -565,21 +584,28 @@ def predict(
565584
confidence = torch.round(label_scores_topk.values, decimals=2) # and their scores
566585

567586
if explain:
568-
all_attributions = []
569-
for k in range(top_k):
570-
attributions = lig.attribute(
571-
(encoded_text, attention_mask, categorical_vars),
572-
target=torch.Tensor(predictions[:, k]).long(),
573-
) # (batch_size, seq_len)
574-
attributions = attributions.sum(dim=-1)
575-
all_attributions.append(attributions.detach().cpu())
576-
577-
all_attributions = torch.stack(all_attributions, dim=1) # (batch_size, top_k, seq_len)
587+
if explain_with_captum:
588+
# Captum explanations
589+
captum_attributions = []
590+
for k in range(top_k):
591+
attributions = lig.attribute(
592+
(encoded_text, attention_mask, categorical_vars),
593+
target=torch.Tensor(predictions[:, k]).long(),
594+
) # (batch_size, seq_len)
595+
attributions = attributions.sum(dim=-1)
596+
captum_attributions.append(attributions.detach().cpu())
597+
598+
captum_attributions = torch.stack(
599+
captum_attributions, dim=1
600+
) # (batch_size, top_k, seq_len)
601+
else:
602+
captum_attributions = None
578603

579604
return {
580605
"prediction": predictions,
581606
"confidence": confidence,
582-
"attributions": all_attributions,
607+
"captum_attributions": captum_attributions,
608+
"label_attention_attributions": label_attention_matrix,
583609
"offset_mapping": tokenize_output.offset_mapping,
584610
"word_ids": tokenize_output.word_ids,
585611
}

0 commit comments

Comments
 (0)