Commit 3c0a85a

feat(ngram): add return offsets and word_ids + fix output_dim
1 parent 84b118b commit 3c0a85a

3 files changed

Lines changed: 104 additions & 21 deletions


torchTextClassifiers/tokenizers/__init__.py

Lines changed: 1 addition & 0 deletions
@@ -6,4 +6,5 @@
     HuggingFaceTokenizer as HuggingFaceTokenizer,
 )
 from .base import TokenizerOutput as TokenizerOutput
+from .ngram import NGramTokenizer as NGramTokenizer
 from .WordPiece import WordPieceTokenizer as WordPieceTokenizer
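
With this re-export in place, NGramTokenizer can be imported from the package namespace like the other tokenizers, so downstream code no longer has to reach into the ngram module directly. A minimal import sketch (only names re-exported above are used):

from torchTextClassifiers.tokenizers import NGramTokenizer, TokenizerOutput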

torchTextClassifiers/tokenizers/base.py

Lines changed: 6 additions & 1 deletion
@@ -65,7 +65,11 @@ def __post_init__(self):
 
 class BaseTokenizer(ABC):
     def __init__(
-        self, vocab_size: int, output_vectorized: bool = False, output_dim: Optional[int] = None
+        self,
+        vocab_size: int,
+        padding_idx: int,
+        output_vectorized: bool = False,
+        output_dim: Optional[int] = None,
     ):
         """
         Base class for tokenizers.
@@ -78,6 +82,7 @@ def __init__(
         self.vocab_size = vocab_size
         self.output_vectorized = output_vectorized
         self.output_dim = output_dim
+        self.padding_idx = padding_idx
         if self.output_vectorized:
             if output_dim is None:
                 raise ValueError(
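
Because padding_idx is now a required argument of BaseTokenizer.__init__, every concrete tokenizer has to pass it alongside vocab_size when calling super().__init__. A minimal sketch of the updated contract with a hypothetical subclass (the class name, vocabulary handling, and choice of pad id are illustrative, not part of this commit):

from typing import List, Optional

from torchTextClassifiers.tokenizers import BaseTokenizer


class ToyTokenizer(BaseTokenizer):
    """Hypothetical subclass showing the updated constructor call."""

    def __init__(self, vocab: List[str], output_dim: Optional[int] = None):
        self.pad_token_id = 0  # illustrative padding id
        super().__init__(
            vocab_size=len(vocab) + 1,      # +1 for the padding token
            padding_idx=self.pad_token_id,  # now mandatory
            output_dim=output_dim,
        )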

torchTextClassifiers/tokenizers/ngram.py

Lines changed: 97 additions & 20 deletions
@@ -4,6 +4,7 @@
 from functools import lru_cache
 from typing import List, Optional, Tuple, Union
 
+import numpy as np
 import torch
 
 from torchTextClassifiers.tokenizers import BaseTokenizer, TokenizerOutput
@@ -113,7 +114,7 @@ def get(self, word: str) -> List[int]:
 
 
 # ============================================================================
-# Vectorized encoding
+# Vectorized encoding with optional metadata
 # ============================================================================
 
 
@@ -124,33 +125,78 @@ def encode_batch_vectorized(
     pad_token_id: int,
     max_length: Optional[int] = None,
     truncation: bool = False,
-) -> Tuple[torch.Tensor, torch.Tensor]:
+    return_offsets_mapping: bool = False,
+    return_word_ids: bool = False,
+    force_max_length: bool = False,
+) -> Tuple[torch.Tensor, torch.Tensor, Optional[List], Optional[List]]:
     """
     Vectorized batch encoding - processes all sentences together.
-    Returns padded tensors directly.
+    Returns padded tensors directly, with optional offset mappings and word IDs.
+
+    Args:
+        force_max_length: If True and max_length is set, always return tensors of size max_length
     """
     all_ids = []
+    all_offsets = [] if return_offsets_mapping else None
+    all_word_ids = [] if return_word_ids else None
     max_len = 0
 
     # First pass: encode all sentences
     for sentence in sentences:
         ids = []
+        offsets = [] if return_offsets_mapping else None
+        word_ids = [] if return_word_ids else None
+
         words = sentence.split()
+        char_offset = 0
+
+        for word_idx, word in enumerate(words):
+            # Find the actual position of this word in the original sentence
+            word_start = sentence.find(word, char_offset)
+            word_end = word_start + len(word)
+            char_offset = word_end
+
+            # Get subword tokens for this word
+            subword_tokens = subword_cache.get(word)
 
-        for word in words:
-            ids.extend(subword_cache.get(word))
+            for token_id in subword_tokens:
+                ids.append(token_id)
 
+                if return_offsets_mapping:
+                    # All subword tokens of a word map to the word's character span
+                    offsets.append((word_start, word_end))
+
+                if return_word_ids:
+                    # All subword tokens of a word get the same word_id
+                    word_ids.append(word_idx)
+
+        # Add EOS token
         ids.append(eos_token_id)
+        if return_offsets_mapping:
+            offsets.append((len(sentence), len(sentence)))  # EOS has no span
+        if return_word_ids:
+            word_ids.append(None)  # EOS is not part of any word
 
         # Truncate if needed
         if truncation and max_length and len(ids) > max_length:
             ids = ids[:max_length]
+            if return_offsets_mapping:
+                offsets = offsets[:max_length]
+            if return_word_ids:
+                word_ids = word_ids[:max_length]
 
         all_ids.append(ids)
+        if return_offsets_mapping:
+            all_offsets.append(offsets)
+        if return_word_ids:
+            all_word_ids.append(word_ids)
         max_len = max(max_len, len(ids))
 
     # Determine final sequence length
-    if max_length and not truncation:
+    if force_max_length and max_length:
+        # Always use max_length when force_max_length is True
+        seq_len = max_length
+    elif max_length and not truncation:
         seq_len = min(max_len, max_length)
     elif max_length:
         seq_len = max_length
@@ -162,13 +208,22 @@ def encode_batch_vectorized(
     input_ids = torch.full((batch_size, seq_len), pad_token_id, dtype=torch.long)
     attention_mask = torch.zeros((batch_size, seq_len), dtype=torch.long)
 
-    # Fill tensors
+    # Fill tensors and pad metadata
    for i, ids in enumerate(all_ids):
         length = min(len(ids), seq_len)
         input_ids[i, :length] = torch.tensor(ids[:length], dtype=torch.long)
         attention_mask[i, :length] = 1
 
-    return input_ids, attention_mask
+        # Pad offsets and word_ids to match sequence length
+        if return_offsets_mapping:
+            # Pad with (0, 0) for padding tokens
+            all_offsets[i] = all_offsets[i][:length] + [(0, 0)] * (seq_len - length)
+
+        if return_word_ids:
+            # Pad with None for padding tokens
+            all_word_ids[i] = all_word_ids[i][:length] + [None] * (seq_len - length)
+
+    return input_ids, attention_mask, all_offsets, all_word_ids
 
 
 # ============================================================================
@@ -183,8 +238,7 @@ class NGramTokenizer(BaseTokenizer):
     - Vectorized batch encoding
     - Cached text normalization
     - Direct tensor operations
-    - No multiprocessing overhead
-    - No Numba dependency
+    - Optional offset mapping and word ID tracking
     """
 
     PAD_TOKEN = "[PAD]"
@@ -200,6 +254,7 @@ def __init__(
         len_word_ngrams: int,
         training_text: Optional[List[str]] = None,
         preprocess: bool = True,
+        output_dim: Optional[int] = None,
         **kwargs,
     ):
         if min_n < 2:
@@ -227,9 +282,11 @@ def __init__(
         self.subword_cache = None
 
         self.vocab_size = 3 + self.nwords + self.num_tokens
-        super().__init__(vocab_size=self.vocab_size)
+        super().__init__(
+            vocab_size=self.vocab_size, padding_idx=self.pad_token_id, output_dim=output_dim
+        )
 
-    def _build_vocab(self, training_text: List[str]):
+    def train(self, training_text: List[str]):
         """Build vocabulary from training text."""
         word_counts = {}
         for sent in training_text:
@@ -261,16 +318,24 @@ def _build_vocab(self, training_text: List[str]):
     def tokenize(
         self,
         text: Union[str, List[str]],
-        padding: str = "longest",
-        max_length: Optional[int] = None,
-        truncation: bool = False,
         return_offsets_mapping: bool = False,
         return_word_ids: bool = False,
         **kwargs,
     ) -> TokenizerOutput:
         """
         Optimized tokenization with vectorized operations.
-        Note: return_offsets_mapping and return_word_ids removed for speed.
+
+        Args:
+            text: Single string or list of strings to tokenize
+            padding: Padding strategy ('longest' or 'max_length')
+            max_length: Maximum sequence length
+            truncation: Whether to truncate sequences exceeding max_length
+            return_offsets_mapping: If True, return character offsets for each token
+            return_word_ids: If True, return word indices for each token
+
+        Returns:
+            TokenizerOutput with input_ids, attention_mask, and optionally
+            offset_mapping and word_ids
         """
         is_single = isinstance(text, str)
         if is_single:
@@ -280,21 +345,33 @@ def tokenize(
         if self.preprocess:
             text = clean_text_feature(text)
 
+        if self.output_dim is not None:
+            max_length = self.output_dim
+            truncation = True
+        else:
+            max_length = None
+            truncation = False
+
         # Vectorized encoding
-        input_ids, attention_mask = encode_batch_vectorized(
+        input_ids, attention_mask, offsets, word_ids = encode_batch_vectorized(
             text,
             self.subword_cache,
             self.eos_token_id,
             self.pad_token_id,
-            max_length=max_length if padding == "max_length" else None,
+            max_length=max_length,
             truncation=truncation,
+            return_offsets_mapping=return_offsets_mapping,
+            return_word_ids=return_word_ids,
         )
 
+        offsets = torch.tensor(offsets) if return_offsets_mapping else None
+        word_ids = np.array(word_ids) if return_word_ids else None
+
         return TokenizerOutput(
             input_ids=input_ids,
             attention_mask=attention_mask,
-            word_ids=None,
-            offset_mapping=None,
+            word_ids=word_ids,
+            offset_mapping=offsets,
         )
 
     def decode(
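
Taken together, the ngram.py changes make sequence length a property of the tokenizer (output_dim, fixed at construction), rename _build_vocab to train, and make offset and word-id metadata opt-in per call. A rough usage sketch, assuming tokenizer is an NGramTokenizer already built with output_dim=32 and trained via tokenizer.train(corpus); the sentence and printed shapes are illustrative:

# Sketch only: `tokenizer` is assumed to be a trained NGramTokenizer with output_dim=32.
out = tokenizer.tokenize(
    ["the cat sat on the mat"],
    return_offsets_mapping=True,  # (start, end) character span per token
    return_word_ids=True,         # word index per token; None for EOS/padding
)
print(out.input_ids.shape)       # torch.Size([1, 32]), driven by output_dim
print(out.attention_mask.shape)  # torch.Size([1, 32])
print(out.offset_mapping.shape)  # torch.Size([1, 32, 2]); (0, 0) marks padding
print(out.word_ids.shape)        # (1, 32) object array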
