from typing import Optional

import torch
-from torch import nn
+import torch.nn as nn
+from torch.nn import functional as F

from torchTextClassifiers.model.components.attention import AttentionConfig, Block, norm


+@dataclass
+class LabelAttentionConfig:
+    n_head: int
+    n_kv_head: int
+    num_classes: int
+
+
@dataclass
class TextEmbedderConfig:
    vocab_size: int
    embedding_dim: int
    padding_idx: int
    attention_config: Optional[AttentionConfig] = None
+    label_attention_config: Optional[LabelAttentionConfig] = None


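For orientation, a minimal sketch of how the new config might be wired up; the field values below are placeholders, not part of this commit. Passing a LabelAttentionConfig is what enables the label-attention path, while leaving it as None keeps the previous mean-pooling behaviour.

    label_cfg = LabelAttentionConfig(n_head=8, n_kv_head=8, num_classes=10)
    embedder_cfg = TextEmbedderConfig(
        vocab_size=50_000,
        embedding_dim=256,
        padding_idx=0,
        attention_config=None,              # token-level transformer blocks stay optional
        label_attention_config=label_cfg,   # None would disable label attention
    )
    embedder = TextEmbedder(embedder_cfg)
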
class TextEmbedder(nn.Module):
@@ -26,8 +35,9 @@ def __init__(self, text_embedder_config: TextEmbedderConfig):
        if isinstance(self.attention_config, dict):
            self.attention_config = AttentionConfig(**self.attention_config)

-        if self.attention_config is not None:
-            self.attention_config.n_embd = text_embedder_config.embedding_dim
+        self.enable_label_attention = text_embedder_config.label_attention_config is not None
+        if self.enable_label_attention:
+            self.label_attention_module = LabelAttentionClassifier(self.config)

        self.vocab_size = text_embedder_config.vocab_size
        self.embedding_dim = text_embedder_config.embedding_dim
@@ -40,6 +50,7 @@ def __init__(self, text_embedder_config: TextEmbedderConfig):
        )

        if self.attention_config is not None:
+            self.attention_config.n_embd = text_embedder_config.embedding_dim
            self.transformer = nn.ModuleDict(
                {
                    "h": nn.ModuleList(
@@ -105,8 +116,23 @@ def _init_weights(self, module):
        elif isinstance(module, nn.Embedding):
            torch.nn.init.normal_(module.weight, mean=0.0, std=1.0)

-    def forward(self, input_ids: torch.Tensor, attention_mask: torch.Tensor) -> torch.Tensor:
-        """Converts input token IDs to their corresponding embeddings."""
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        attention_mask: torch.Tensor,
+        return_label_attention_matrix: bool = False,
+    ) -> torch.Tensor:
+        """Converts input token IDs to their corresponding embeddings.
+
+        Args:
+            input_ids (torch.Tensor[Long]), shape (batch_size, seq_len): Tokenized input text.
+            attention_mask (torch.Tensor[Long]), shape (batch_size, seq_len): Attention mask indicating non-pad tokens.
+            return_label_attention_matrix (bool): Whether to return the label attention matrix.
+        Returns:
+            torch.Tensor: Text embeddings, shape (batch_size, embedding_dim) if self.enable_label_attention is False, else (batch_size, num_labels, embedding_dim).
+            torch.Tensor: Label attention matrix, shape (batch_size, n_head, num_labels, seq_len) if return_label_attention_matrix is True, else None.
+                Also None if label attention is disabled (even if return_label_attention_matrix is True).
+        """

        encoded_text = input_ids  # clearer name
        if encoded_text.dtype != torch.long:
@@ -138,14 +164,25 @@ def forward(self, input_ids: torch.Tensor, attention_mask: torch.Tensor) -> torch.Tensor:

        token_embeddings = norm(token_embeddings)

-        text_embedding = self._get_sentence_embedding(
-            token_embeddings=token_embeddings, attention_mask=attention_mask
-        )
+        text_embedding, label_attention_matrix = self._get_sentence_embedding(
+            token_embeddings=token_embeddings,
+            attention_mask=attention_mask,
+            return_label_attention_matrix=return_label_attention_matrix,
+        ).values()

-        return text_embedding
+        if return_label_attention_matrix:
+            return (
+                text_embedding,
+                label_attention_matrix,
+            )  # label_attention_matrix is None if label attention is disabled
+        else:
+            return text_embedding
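A short illustration of the two call patterns the reworked forward supports; the tensors below are placeholders, and the shape comments assume label attention is enabled (with it disabled, the first call returns (batch_size, embedding_dim) and the second returns the embedding together with None).

    ids = torch.randint(1, embedder_cfg.vocab_size, (4, 32))   # (batch_size, seq_len)
    mask = torch.ones_like(ids)                                 # every token treated as non-pad
    emb = embedder(ids, mask)                                   # (4, num_classes, embedding_dim)
    emb, attn = embedder(ids, mask, return_label_attention_matrix=True)
    # attn has shape (4, n_head, num_classes, seq_len)
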

    def _get_sentence_embedding(
-        self, token_embeddings: torch.Tensor, attention_mask: torch.Tensor
+        self,
+        token_embeddings: torch.Tensor,
+        attention_mask: torch.Tensor,
+        return_label_attention_matrix: bool = False,
    ) -> torch.Tensor:
        """
        Compute sentence embedding from embedded tokens - "remove" second dimension.
@@ -163,7 +200,7 @@ def _get_sentence_embedding(
        # mask pad-tokens

        if self.attention_config is not None:
-            if self.attention_config.aggregation_method is not None:
+            if self.attention_config.aggregation_method is not None:  # default is "mean"
                if self.attention_config.aggregation_method == "first":
                    return token_embeddings[:, 0, :]
                elif self.attention_config.aggregation_method == "last":
@@ -181,25 +218,29 @@ def _get_sentence_embedding(

        assert self.attention_config is None or self.attention_config.aggregation_method == "mean"

-        mask = attention_mask.unsqueeze(-1).float()  # (batch_size, seq_len, 1)
-        masked_embeddings = token_embeddings * mask  # (batch_size, seq_len, embedding_dim)
-
-        sentence_embedding = masked_embeddings.sum(dim=1) / mask.sum(dim=1).clamp(
-            min=1.0
-        )  # avoid division by zero
-
-        sentence_embedding = torch.nan_to_num(sentence_embedding, 0.0)
-
-        return sentence_embedding
-
-    def __call__(self, *args, **kwargs):
-        out = super().__call__(*args, **kwargs)
-        if out.dim() != 2:
-            raise ValueError(
-                f"Output of {self.__class__.__name__}.forward must be 2D "
-                f"(got shape {tuple(out.shape)})"
+        if self.enable_label_attention:
+            label_attention_result = self.label_attention_module(
+                token_embeddings, compute_attention_matrix=return_label_attention_matrix
            )
-        return out
+            sentence_embedding = label_attention_result[
+                "sentence_embedding"
+            ]  # (bs, n_labels, d_embed), so classifier needs to be a (d_embed, 1) matrix
+            label_attention_matrix = label_attention_result["attention_matrix"]
+
+        else:  # sentence embedding = mean of (non-pad) token embeddings
+            mask = attention_mask.unsqueeze(-1).float()  # (batch_size, seq_len, 1)
+            masked_embeddings = token_embeddings * mask  # (batch_size, seq_len, embedding_dim)
+            sentence_embedding = masked_embeddings.sum(dim=1) / mask.sum(dim=1).clamp(
+                min=1.0
+            )  # avoid division by zero
+
+            sentence_embedding = torch.nan_to_num(sentence_embedding, 0.0)
+            label_attention_matrix = None
+
+        return {
+            "sentence_embedding": sentence_embedding,
+            "label_attention_matrix": label_attention_matrix,
+        }

    def _precompute_rotary_embeddings(self, seq_len, head_dim, base=10000, device=None):
        # autodetect the device from model embeddings
@@ -221,3 +262,79 @@ def _precompute_rotary_embeddings(self, seq_len, head_dim, base=10000, device=None):
        )  # add batch and head dims for later broadcasting
        return cos, sin

+
+
+class LabelAttentionClassifier(nn.Module):
+    """
+    A head for aggregating token embeddings into label-specific sentence embeddings using a cross-attention mechanism.
+    Labels are queries that attend over token embeddings (keys and values) to produce label-specific embeddings.
+
+    """
+
+    def __init__(self, config: TextEmbedderConfig):
+        super().__init__()
+
+        label_attention_config = config.label_attention_config
+        self.embedding_dim = config.embedding_dim
+        self.num_classes = label_attention_config.num_classes
+        self.n_head = label_attention_config.n_head
+        self.n_kv_head = label_attention_config.n_kv_head
+        self.enable_gqa = (
+            self.n_head != self.n_kv_head
+        )  # Group Query Attention (GQA): duplicate key/value heads to match query heads if desired
+        self.head_dim = self.embedding_dim // self.n_head
+
+        self.label_embeds = nn.Embedding(self.num_classes, self.embedding_dim)
+
+        self.c_q = nn.Linear(self.embedding_dim, self.n_head * self.head_dim, bias=False)
+        self.c_k = nn.Linear(self.embedding_dim, self.n_kv_head * self.head_dim, bias=False)
+        self.c_v = nn.Linear(self.embedding_dim, self.n_kv_head * self.head_dim, bias=False)
+        self.c_proj = nn.Linear(self.embedding_dim, self.embedding_dim, bias=False)
+
+    def forward(self, token_embeddings, compute_attention_matrix: Optional[bool] = False):
+        """
+        Args:
+            token_embeddings (torch.Tensor), shape (batch, seq_len, d_model): Embedded tokens from the text input.
+            compute_attention_matrix (bool): Whether to compute and return the attention matrix.
+        Returns:
+            dict: {
+                "sentence_embedding": torch.Tensor, shape (batch, num_classes, d_model): Label-specific sentence embeddings.
+                "attention_matrix": Optional[torch.Tensor], shape (batch, n_head, num_classes, seq_len): Attention weights if compute_attention_matrix is True, else None.
+            }
+
+        """
+        B, T, C = token_embeddings.size()
+
+        # Create label indices [0, 1, ..., num_classes - 1] for the whole batch
+        label_indices = torch.arange(self.num_classes, device=token_embeddings.device).expand(B, -1)
+
+        all_label_embeddings = self.label_embeds(
+            label_indices
+        )  # Shape: [batch, num_classes, d_model]
+        all_label_embeddings = norm(all_label_embeddings)
+
+        q = self.c_q(all_label_embeddings).view(B, self.num_classes, self.n_head, self.head_dim)
+        k = self.c_k(token_embeddings).view(B, T, self.n_kv_head, self.head_dim)
+        v = self.c_v(token_embeddings).view(B, T, self.n_kv_head, self.head_dim)
+
+        q, k = norm(q), norm(k)  # QK norm
+        q, k, v = (
+            q.transpose(1, 2),
+            k.transpose(1, 2),
+            v.transpose(1, 2),
+        )  # make head be batch dim, i.e. (B, T, H, D) -> (B, H, T, D)
+
+        y = F.scaled_dot_product_attention(q, k, v, is_causal=False, enable_gqa=self.enable_gqa)
+
+        # Re-assemble the heads side by side and project back to residual stream
+        y = y.transpose(1, 2).contiguous().view(B, self.num_classes, -1)  # (bs, n_labels, d_model)
+        y = self.c_proj(y)
+
+        attention_matrix = None
+        if compute_attention_matrix:
+            if self.enable_gqa:
+                # repeat kv heads so the manual score computation lines up with the query heads
+                k = k.repeat_interleave(self.n_head // self.n_kv_head, dim=1)
+            # size (B, n_head, n_labels, seq_len) - we let the user handle aggregation over heads if desired
+            attention_matrix = torch.softmax(
+                torch.matmul(q, k.transpose(-2, -1)) / (self.head_dim**0.5), dim=-1
+            )
+
+        return {"sentence_embedding": y, "attention_matrix": attention_matrix}
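The comment in _get_sentence_embedding hints that each label-specific embedding is meant to be scored by a (d_embed, 1) projection. A minimal sketch of such a head, assuming one shared linear scorer across labels; the PerLabelHead name and this design are illustrative, not part of this commit.

    class PerLabelHead(nn.Module):
        """Scores each label from its own attended sentence embedding."""

        def __init__(self, embedding_dim: int):
            super().__init__()
            self.scorer = nn.Linear(embedding_dim, 1)  # the (d_embed, 1) matrix mentioned above

        def forward(self, label_embeddings: torch.Tensor) -> torch.Tensor:
            # label_embeddings: (batch, num_labels, d_embed) -> logits: (batch, num_labels)
            return self.scorer(label_embeddings).squeeze(-1)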