From c1253ae1fa4c04cec5380c7e39ab838418d72c9d Mon Sep 17 00:00:00 2001 From: meilame-tayebjee Date: Mon, 13 Apr 2026 18:27:53 +0000 Subject: [PATCH 1/7] feat!(raw inputs): first working example with CategoricalValueEncoder - user must fit LabelEncoders or provide a mapping dictionary for each categorical variable - we embark this object with the wrapper. Now it accepts raw input at inference - tests are adapted --- tests/test_pipeline.py | 96 +++++++++----- .../categorical_value_encoder/__init__.py | 2 + .../categorical_value_encoder.py | 100 ++++++++++++++ torchTextClassifiers/torchTextClassifiers.py | 124 ++++++++++++------ 4 files changed, 247 insertions(+), 75 deletions(-) create mode 100644 torchTextClassifiers/categorical_value_encoder/__init__.py create mode 100644 torchTextClassifiers/categorical_value_encoder/categorical_value_encoder.py diff --git a/tests/test_pipeline.py b/tests/test_pipeline.py index f5f9cab..94e1708 100644 --- a/tests/test_pipeline.py +++ b/tests/test_pipeline.py @@ -1,8 +1,10 @@ import numpy as np import pytest import torch +from sklearn.preprocessing import LabelEncoder from torchTextClassifiers import ModelConfig, TrainingConfig, torchTextClassifiers +from torchTextClassifiers.categorical_value_encoder import CategoricalValueEncoder, DictEncoder from torchTextClassifiers.dataset import TextClassificationDataset from torchTextClassifiers.model import TextClassificationModel, TextClassificationModule from torchTextClassifiers.model.components import ( @@ -13,7 +15,13 @@ TextEmbedder, TextEmbedderConfig, ) -from torchTextClassifiers.tokenizers import HuggingFaceTokenizer, NGramTokenizer, WordPieceTokenizer +from torchTextClassifiers.tokenizers import NGramTokenizer + +try: + from torchTextClassifiers.tokenizers import HuggingFaceTokenizer, WordPieceTokenizer +except ImportError: + pass + from torchTextClassifiers.utilities.plot_explainability import ( map_attributions_to_char, map_attributions_to_word, @@ -33,21 +41,31 @@ def 
sample_data(): "Good example here", "Bad example here", ] - categorical_data = np.array([[1, 0], [0, 1], [1, 0], [0, 1], [1, 0], [0, 1]]).astype(int) - labels = np.array([1, 0, 1, 0, 1, 5]) + # String categorical variables — two features, two unique values each + categorical_data = np.array( + [ + ["cat", "red"], + ["dog", "blue"], + ["cat", "red"], + ["dog", "blue"], + ["cat", "red"], + ["dog", "blue"], + ] + ) + # String labels + labels = np.array(["positive", "negative", "positive", "negative", "positive", "negative"]) return sample_text_data, categorical_data, labels @pytest.fixture def model_params(): - """Fixture providing common model parameters.""" + """Fixture providing common model parameters (class count and vocab sizes are + derived from data at runtime inside run_full_pipeline).""" return { "embedding_dim": 96, "n_layers": 2, "n_head": 4, - "num_classes": 10, - "categorical_vocab_sizes": [2, 2], "categorical_embedding_dims": [4, 7], } @@ -61,10 +79,28 @@ def run_full_pipeline( label_attention_enabled: bool = False, ): """Helper function to run the complete pipeline for a given tokenizer.""" - # Create dataset + + # --- Encode categorical variables (string → int) --- + n_features = categorical_data.shape[1] + encoders = { + str(i): DictEncoder( + {v: j for j, v in enumerate(sorted(set(categorical_data[:, i].tolist())))} + ) + for i in range(n_features) + } + cat_encoder = CategoricalValueEncoder(encoders) + encoded_categorical = cat_encoder.transform(categorical_data) + vocab_sizes = cat_encoder.vocabulary_sizes + + # --- Encode string labels to contiguous integers --- + label_encoder = LabelEncoder() + encoded_labels = label_encoder.fit_transform(labels) + num_classes = len(label_encoder.classes_) + + # --- Direct component test: dataset with already-encoded integers --- dataset = TextClassificationDataset( texts=sample_text_data, - categorical_variables=categorical_data.tolist(), + categorical_variables=encoded_categorical.tolist(), 
tokenizer=tokenizer, labels=None, ) @@ -94,7 +130,7 @@ def run_full_pipeline( label_attention_config=( LabelAttentionConfig( n_head=attention_config.n_head, - num_classes=model_params["num_classes"], + num_classes=num_classes, ) if label_attention_enabled else None @@ -104,9 +140,9 @@ def run_full_pipeline( text_embedder = TextEmbedder(text_embedder_config=text_embedder_config) text_embedder.init_weights() - # Create categorical variable net + # Create categorical variable net (vocab sizes from fitted encoder) categorical_var_net = CategoricalVariableNet( - categorical_vocabulary_sizes=model_params["categorical_vocab_sizes"], + categorical_vocabulary_sizes=vocab_sizes, categorical_embedding_dims=model_params["categorical_embedding_dims"], ) @@ -114,7 +150,7 @@ def run_full_pipeline( expected_input_dim = model_params["embedding_dim"] + categorical_var_net.output_dim classification_head = ClassificationHead( input_dim=expected_input_dim, - num_classes=model_params["num_classes"] if not label_attention_enabled else 1, + num_classes=num_classes if not label_attention_enabled else 1, ) # Create model @@ -141,34 +177,35 @@ def run_full_pipeline( # Test prediction module.predict_step(batch) - # Prepare data for training + # --- Wrapper pipeline with string categorical data --- + # X keeps categorical columns as raw strings; the wrapper encoder handles them. 
X = np.column_stack([sample_text_data, categorical_data]) - Y = labels + Y = encoded_labels # integer-encoded labels (from LabelEncoder) - # Create model config + # Create model config (vocab sizes and num_classes come from the encoders) model_config = ModelConfig( embedding_dim=model_params["embedding_dim"], - categorical_vocabulary_sizes=model_params["categorical_vocab_sizes"], + categorical_vocabulary_sizes=vocab_sizes, categorical_embedding_dims=model_params["categorical_embedding_dims"], - num_classes=model_params["num_classes"], + num_classes=num_classes, attention_config=attention_config, n_heads_label_attention=attention_config.n_head, ) - # Create training config training_config = TrainingConfig( lr=1e-3, batch_size=4, num_epochs=1, ) - # Create classifier + # Create classifier — pass the fitted categorical encoder ttc = torchTextClassifiers( tokenizer=tokenizer, model_config=model_config, + categorical_encoder=cat_encoder, ) - # Train + # Train with raw string categorical data ttc.train( X_train=X, y_train=Y, @@ -176,10 +213,11 @@ def run_full_pipeline( y_val=Y, training_config=training_config, ) - ttc.load(ttc.save_path) # test load + assert ttc.save_path is not None + ttc.load(ttc.save_path) # test load (encoder is also saved/restored) # Predict with explanations - top_k = 5 + top_k = min(5, num_classes) predictions = ttc.predict( X, @@ -197,7 +235,7 @@ def run_full_pipeline( expected_shape = ( len(sample_text_data), # batch_size model_params["n_head"], # n_head - model_params["num_classes"], # num_classes + num_classes, # num_classes (derived from label encoder) tokenizer.output_dim, # seq_len ) assert label_attention_attributions.shape == expected_shape, ( @@ -205,7 +243,6 @@ def run_full_pipeline( f"Expected {expected_shape}, got {label_attention_attributions.shape}" ) else: - # When label attention is not enabled, the attributions should be None assert ( predictions.get("label_attention_attributions") is None ), "Label attention attributions should 
be None when label_attention_enabled is False" @@ -220,8 +257,6 @@ def run_full_pipeline( words, word_attributions = map_attributions_to_word(attributions, text, word_ids, offsets) char_attributions = map_attributions_to_char(attributions, offsets, text) - # Note: We're not actually plotting in tests, just calling the functions - # to ensure they don't raise errors plot_attributions_at_char(text, char_attributions) plot_attributions_at_word( text=text, @@ -238,11 +273,9 @@ def test_wordpiece_tokenizer(sample_data, model_params): tokenizer = WordPieceTokenizer(vocab_size, output_dim=50) tokenizer.train(sample_text_data) - # Check tokenizer works result = tokenizer.tokenize(sample_text_data) assert result.input_ids.shape[0] == len(sample_text_data) - # Run full pipeline run_full_pipeline(tokenizer, sample_text_data, categorical_data, labels, model_params) @@ -254,11 +287,9 @@ def test_huggingface_tokenizer(sample_data, model_params): "google-bert/bert-base-uncased", output_dim=50 ) - # Check tokenizer works result = tokenizer.tokenize(sample_text_data) assert result.input_ids.shape[0] == len(sample_text_data) - # Run full pipeline run_full_pipeline(tokenizer, sample_text_data, categorical_data, labels, model_params) @@ -271,18 +302,15 @@ def test_ngram_tokenizer(sample_data, model_params): ) tokenizer.train(sample_text_data) - # Check tokenizer works result = tokenizer.tokenize( sample_text_data[0], return_offsets_mapping=True, return_word_ids=True ) assert result.input_ids is not None - # Check batch decode batch_result = tokenizer.tokenize(sample_text_data) decoded = tokenizer.batch_decode(batch_result.input_ids.tolist()) assert len(decoded) == len(sample_text_data) - # Run full pipeline run_full_pipeline(tokenizer, sample_text_data, categorical_data, labels, model_params) @@ -294,11 +322,9 @@ def test_label_attention_enabled(sample_data, model_params): tokenizer = WordPieceTokenizer(vocab_size, output_dim=50) tokenizer.train(sample_text_data) - # Check tokenizer 
works result = tokenizer.tokenize(sample_text_data) assert result.input_ids.shape[0] == len(sample_text_data) - # Run full pipeline with label attention enabled run_full_pipeline( tokenizer, sample_text_data, diff --git a/torchTextClassifiers/categorical_value_encoder/__init__.py b/torchTextClassifiers/categorical_value_encoder/__init__.py new file mode 100644 index 0000000..722e8c2 --- /dev/null +++ b/torchTextClassifiers/categorical_value_encoder/__init__.py @@ -0,0 +1,2 @@ +from .categorical_value_encoder import CategoricalValueEncoder as CategoricalValueEncoder +from .categorical_value_encoder import DictEncoder as DictEncoder diff --git a/torchTextClassifiers/categorical_value_encoder/categorical_value_encoder.py b/torchTextClassifiers/categorical_value_encoder/categorical_value_encoder.py new file mode 100644 index 0000000..dcfbaf8 --- /dev/null +++ b/torchTextClassifiers/categorical_value_encoder/categorical_value_encoder.py @@ -0,0 +1,100 @@ +import numpy as np +from sklearn.preprocessing import LabelEncoder + + +class DictEncoder: + def __init__(self, mapping: dict): + self.mapping: dict[str, int] = mapping + self.inverse_mapping: dict[int, str] = {v: k for k, v in mapping.items()} + + def __call__(self, value): + return self.mapping.get(value, None) + + def transform(self, col): + return self._dict_map(self.mapping, col) + + def inverse_transform(self, col): + return self._dict_map(self.inverse_mapping, col) + + @staticmethod + def _dict_map(dic, col): + if isinstance(col, np.ndarray): + return np.vectorize(dic.get)(col) + elif isinstance(col, list): + return [dic.get(v, None) for v in col] + else: + raise TypeError("Unsupported type for encoding: {}".format(type(col))) + + +class CategoricalValueEncoder: + """ + An object to encode raw categorical values into numerical indices. + + Initialized with pre-built DictEncoder or sklearn LabelEncoder instances, + one per categorical feature. 
+ + Build encoders externally before passing them in: + - DictEncoder: provide a ``{value: index}`` mapping directly. + - sklearn LabelEncoder: call ``LabelEncoder().fit(column)`` per feature. + + Initialization: + - encoders: A dictionary mapping feature names to DictEncoder or LabelEncoder instances. + + Properties: + - vocabulary_sizes: List of vocabulary sizes (number of unique values) for each feature. + + Usage: + - transform(array): Encode a 2D array of shape (N, n_features) to integers. + - __call__(array): Alias for transform. + """ + + def __init__(self, encoders: dict[str, DictEncoder | LabelEncoder]): + self.encoders = encoders + + @property + def vocabulary_sizes(self) -> list[int]: + """Number of unique categories per feature, in order.""" + sizes = [] + for enc in self.encoders.values(): + if isinstance(enc, DictEncoder): + sizes.append(len(enc.mapping)) + elif hasattr(enc, "classes_"): + sizes.append(len(enc.classes_)) + else: + raise TypeError(f"Unsupported encoder type: {type(enc)}") + return sizes + + def transform(self, X_categorical: np.ndarray) -> np.ndarray: + """Encode all categorical columns to integer indices. + + Values are converted to strings before lookup. Unknown values raise a ValueError. + + Args: + X_categorical: Array of shape (N, n_features) with categorical values. + + Returns: + Integer-encoded array of shape (N, n_features), dtype int64. + + Raises: + ValueError: If any value was not seen during fitting. + """ + if X_categorical.ndim == 1: + X_categorical = X_categorical.reshape(-1, 1) + + result = np.empty(X_categorical.shape, dtype=np.int64) + for idx, (name, encoder) in enumerate(self.encoders.items()): + col = X_categorical[:, idx].astype(str) + encoded = encoder.transform(col) + try: + result[:, idx] = encoded.astype(np.int64) + except (TypeError, ValueError): + unknown = [v for v, e in zip(col.tolist(), encoded.tolist()) if e is None] + raise ValueError( + f"Unknown values in categorical feature '{name}': {unknown}. 
" + "These values were not seen during fitting." + ) + + return result + + def __call__(self, array: np.ndarray) -> np.ndarray: + return self.transform(array) diff --git a/torchTextClassifiers/torchTextClassifiers.py b/torchTextClassifiers/torchTextClassifiers.py index e38bb97..ca783d2 100644 --- a/torchTextClassifiers/torchTextClassifiers.py +++ b/torchTextClassifiers/torchTextClassifiers.py @@ -22,6 +22,7 @@ ModelCheckpoint, ) +from torchTextClassifiers.categorical_value_encoder import CategoricalValueEncoder from torchTextClassifiers.dataset import TextClassificationDataset from torchTextClassifiers.model import TextClassificationModel, TextClassificationModule from torchTextClassifiers.model.components import ( @@ -106,31 +107,44 @@ def __init__( tokenizer: BaseTokenizer, model_config: ModelConfig, ragged_multilabel: bool = False, + categorical_encoder: Optional[CategoricalValueEncoder] = None, ): """Initialize the torchTextClassifiers instance. Args: tokenizer: A tokenizer instance for text preprocessing model_config: Configuration parameters for the text classification model + ragged_multilabel: Whether to use ragged multilabel classification + categorical_encoder: Optional CategoricalValueEncoder for encoding + raw string (or mixed) categorical values to integers. Build it + beforehand from DictEncoder or sklearn LabelEncoder instances and + pass it here. If None, categorical columns in X must already be + integer-encoded. Example: >>> from torchTextClassifiers import ModelConfig, TrainingConfig, torchTextClassifiers - >>> # Assume tokenizer is a trained BaseTokenizer instance + >>> from torchTextClassifiers.categorical_value_encoder import CategoricalValueEncoder, DictEncoder + >>> # Build one DictEncoder per categorical feature + >>> encoders = {str(i): DictEncoder({v: j for j, v in enumerate(sorted(set(X_categorical[:, i])))}) + ... 
for i in range(X_categorical.shape[1])} + >>> encoder = CategoricalValueEncoder(encoders) >>> model_config = ModelConfig( ... embedding_dim=10, - ... categorical_vocabulary_sizes=[30, 25], + ... categorical_vocabulary_sizes=encoder.vocabulary_sizes, ... categorical_embedding_dims=[10, 5], ... num_classes=10, ... ) >>> ttc = torchTextClassifiers( ... tokenizer=tokenizer, ... model_config=model_config, + ... categorical_encoder=encoder, ... ) """ self.model_config = model_config self.tokenizer = tokenizer self.ragged_multilabel = ragged_multilabel + self.categorical_encoder: CategoricalValueEncoder | None = categorical_encoder if hasattr(self.tokenizer, "trained"): if not self.tokenizer.trained: @@ -240,26 +254,39 @@ def train( ... training_config=training_config, ... ) """ + + if X_train[:, 1:].dtype != np.int64: + if self.categorical_encoder is not None: + if verbose: + logger.info("Encoding categorical variables in training data...") + X_train[:, 1:] = self.categorical_encoder.transform(X_train[:, 1:]) + else: + raise ValueError( + "Categorical variables must be integer-encoded: provide a CategoricalValueEncoder." + ) + # Input validation - X_train, y_train = self._check_XY(X_train, y_train) + X_train_checked, y_train = self._check_XY(X_train, y_train) if X_val is not None: assert y_val is not None, "y_val must be provided if X_val is provided." if y_val is not None: assert X_val is not None, "X_val must be provided if y_val is provided." 
+ X_val_checked: Optional[Dict[str, Any]] = None if X_val is not None and y_val is not None: - X_val, y_val = self._check_XY(X_val, y_val) + X_val_checked, y_val = self._check_XY(X_val, y_val) if ( - X_train["categorical_variables"] is not None - and X_val["categorical_variables"] is not None + X_train_checked["categorical_variables"] is not None + and X_val_checked is not None + and X_val_checked["categorical_variables"] is not None ): assert ( - X_train["categorical_variables"].ndim > 1 - and X_train["categorical_variables"].shape[1] - == X_val["categorical_variables"].shape[1] - or X_val["categorical_variables"].ndim == 1 + X_train_checked["categorical_variables"].ndim > 1 + and X_train_checked["categorical_variables"].shape[1] + == X_val_checked["categorical_variables"].shape[1] + or X_val_checked["categorical_variables"].ndim == 1 ), "X_train and X_val must have the same number of columns." if verbose: @@ -299,8 +326,8 @@ def train( logger.info(f"Running on: {device}") train_dataset = TextClassificationDataset( - texts=X_train["text"], - categorical_variables=X_train["categorical_variables"], # None if no cat vars + texts=X_train_checked["text"], + categorical_variables=X_train_checked["categorical_variables"], # None if no cat vars tokenizer=self.tokenizer, labels=y_train.tolist(), ragged_multilabel=self.ragged_multilabel, @@ -312,10 +339,10 @@ def train( **training_config.dataloader_params if training_config.dataloader_params else {}, ) - if X_val is not None and y_val is not None: + if X_val_checked is not None and y_val is not None: val_dataset = TextClassificationDataset( - texts=X_val["text"], - categorical_variables=X_val["categorical_variables"], # None if no cat vars + texts=X_val_checked["text"], + categorical_variables=X_val_checked["categorical_variables"], # None if no cat vars tokenizer=self.tokenizer, labels=y_val, ragged_multilabel=self.ragged_multilabel, @@ -390,14 +417,14 @@ def train( self.lightning_module.eval() - def _check_XY(self, X: 
np.ndarray, Y: np.ndarray) -> Tuple[np.ndarray, np.ndarray]: - X = self._check_X(X) - Y = self._check_Y(Y) + def _check_XY(self, X: np.ndarray, Y: np.ndarray) -> Tuple[Dict[str, Any], np.ndarray]: + X_checked = self._check_X(X) + Y_checked = self._check_Y(Y) - if X["text"].shape[0] != len(Y): + if X_checked["text"].shape[0] != len(Y_checked): raise ValueError("X_train and y_train must have the same number of observations.") - return X, Y + return X_checked, Y_checked @staticmethod def _check_text_col(X): @@ -415,46 +442,48 @@ def _check_text_col(X): return text - def _check_categorical_variables(self, X: np.ndarray) -> None: - """Check if categorical variables in X match training configuration. + def _check_categorical_variables(self, X: np.ndarray) -> np.ndarray: + """Validate and encode categorical variables from X. + + If a ``categorical_encoder`` was provided at initialization, raw string + or mixed values are encoded to integers via that encoder. Otherwise the + categorical columns must already be integer-encodable. Args: - X: Input data to check + X: Full input array whose first column is text and whose remaining + columns are categorical variables. + + Returns: + Integer-encoded categorical array of shape (N, n_cat_features). Raises: - ValueError: If the number of categorical variables does not match - the training configuration + ValueError: If the number of categorical features does not match the + model configuration, if values exceed vocabulary bounds, or if + values cannot be cast to integers and no encoder was provided. """ - assert self.categorical_var_net is not None - if X.ndim > 1: - num_cat_vars = X.shape[1] - 1 - else: - num_cat_vars = 0 + num_cat_vars = X.shape[1] - 1 if X.ndim > 1 else 0 if num_cat_vars != self.categorical_var_net.num_categorical_features: raise ValueError( - f"X must have the same number of categorical variables as the number of embedding layers in the categorical net: ({self.categorical_var_net.num_categorical_features})." 
+ f"X must have the same number of categorical variables as the number of " + f"embedding layers in the categorical net: ({self.categorical_var_net.num_categorical_features})." ) - try: - categorical_variables = X[:, 1:].astype(int) - except ValueError: - logger.error( - f"Columns {1} to {X.shape[1] - 1} of X_train must be castable in integer format." - ) + categorical_variables = X[:, 1:].astype(int) - for j in range(X.shape[1] - 1): + for j in range(num_cat_vars): max_cat_value = categorical_variables[:, j].max() if max_cat_value >= self.categorical_var_net.categorical_vocabulary_sizes[j]: raise ValueError( - f"Categorical variable at index {j} has value {max_cat_value} which exceeds the vocabulary size of {self.categorical_var_net.categorical_vocabulary_sizes[j]}." + f"Categorical variable at index {j} has value {max_cat_value} which exceeds " + f"the vocabulary size of {self.categorical_var_net.categorical_vocabulary_sizes[j]}." ) return categorical_variables - def _check_X(self, X: np.ndarray) -> np.ndarray: + def _check_X(self, X: np.ndarray) -> Dict[str, Any]: text = self._check_text_col(X) categorical_variables = None @@ -657,6 +686,7 @@ def save(self, path: Union[str, Path]) -> None: "num_classes": self.num_classes, "checkpoint_path": str(checkpoint_path) if checkpoint_path else None, "device": str(self.device) if hasattr(self, "device") else None, + "has_categorical_encoder": self.categorical_encoder is not None, } # Save metadata @@ -668,6 +698,11 @@ def save(self, path: Union[str, Path]) -> None: with open(tokenizer_path, "wb") as f: pickle.dump(self.tokenizer, f) + # Save categorical encoder if present + if self.categorical_encoder is not None: + with open(path / "categorical_encoder.pkl", "wb") as f: + pickle.dump(self.categorical_encoder, f) + logger.info(f"Model saved successfully to {path}") @classmethod @@ -701,11 +736,20 @@ def load(cls, path: Union[str, Path], device: str = "auto") -> "torchTextClassif # Reconstruct model_config model_config = 
ModelConfig.from_dict(metadata["model_config"]) + # Load categorical encoder if one was saved + categorical_encoder = None + if metadata.get("has_categorical_encoder"): + encoder_path = path / "categorical_encoder.pkl" + if encoder_path.exists(): + with open(encoder_path, "rb") as f: + categorical_encoder = pickle.load(f) + # Create instance instance = cls( tokenizer=tokenizer, model_config=model_config, ragged_multilabel=metadata["ragged_multilabel"], + categorical_encoder=categorical_encoder, ) # Set device From 4cb8632b8fce87c3425937fa6ff0396d300213a6 Mon Sep 17 00:00:00 2001 From: meilame-tayebjee Date: Wed, 15 Apr 2026 16:26:16 +0000 Subject: [PATCH 2/7] fix: hardcoded value --- torchTextClassifiers/dataset/dataset.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/torchTextClassifiers/dataset/dataset.py b/torchTextClassifiers/dataset/dataset.py index c4f6a83..6475567 100644 --- a/torchTextClassifiers/dataset/dataset.py +++ b/torchTextClassifiers/dataset/dataset.py @@ -90,7 +90,7 @@ def collate_fn(self, batch): padding_value=-1, # use impossible class ).int() - labels_tensor = torch.zeros(labels_padded.size(0), 6).float() + labels_tensor = torch.zeros(labels_padded.size(0), self.num_classes).float() mask = labels_padded != -1 batch_size = labels_padded.size(0) From 08dc4e3459f799f30afee15a9a2ad8fee62a8372 Mon Sep 17 00:00:00 2001 From: meilame-tayebjee Date: Wed, 15 Apr 2026 16:28:12 +0000 Subject: [PATCH 3/7] feat!(value_encoder): stabilize first implem for value_encoder - take a step back, add label encoder vs categroical encoder - adapted checkers at training time - train is working, need to adapt predict - saving and loading is working - incidentally solving #77 --- .../categorical_value_encoder/__init__.py | 2 +- .../categorical_value_encoder.py | 91 ++++++++--- torchTextClassifiers/torchTextClassifiers.py | 144 +++++++++++------- 3 files changed, 160 insertions(+), 77 deletions(-) diff --git 
a/torchTextClassifiers/categorical_value_encoder/__init__.py b/torchTextClassifiers/categorical_value_encoder/__init__.py index 722e8c2..7152162 100644 --- a/torchTextClassifiers/categorical_value_encoder/__init__.py +++ b/torchTextClassifiers/categorical_value_encoder/__init__.py @@ -1,2 +1,2 @@ -from .categorical_value_encoder import CategoricalValueEncoder as CategoricalValueEncoder from .categorical_value_encoder import DictEncoder as DictEncoder +from .categorical_value_encoder import ValueEncoder as ValueEncoder diff --git a/torchTextClassifiers/categorical_value_encoder/categorical_value_encoder.py b/torchTextClassifiers/categorical_value_encoder/categorical_value_encoder.py index dcfbaf8..e653721 100644 --- a/torchTextClassifiers/categorical_value_encoder/categorical_value_encoder.py +++ b/torchTextClassifiers/categorical_value_encoder/categorical_value_encoder.py @@ -1,3 +1,5 @@ +from typing import Optional + import numpy as np from sklearn.preprocessing import LabelEncoder @@ -25,44 +27,71 @@ def _dict_map(dic, col): else: raise TypeError("Unsupported type for encoding: {}".format(type(col))) + @property + def vocabulary_size(self): + return len(self.mapping) + -class CategoricalValueEncoder: +class ValueEncoder: """ An object to encode raw categorical values into numerical indices. - Initialized with pre-built DictEncoder or sklearn LabelEncoder instances, - one per categorical feature. - Build encoders externally before passing them in: - DictEncoder: provide a ``{value: index}`` mapping directly. - sklearn LabelEncoder: call ``LabelEncoder().fit(column)`` per feature. Initialization: - - encoders: A dictionary mapping feature names to DictEncoder or LabelEncoder instances. + - label_encoder: A DictEncoder or LabelEncoder instance for encoding labels. + - encoders (optional): A dictionary mapping feature names to DictEncoder or LabelEncoder instances. Properties: - vocabulary_sizes: List of vocabulary sizes (number of unique values) for each feature. 
+ - num_classes: Number of unique classes in the label encoder. Usage: - transform(array): Encode a 2D array of shape (N, n_features) to integers. - __call__(array): Alias for transform. """ - def __init__(self, encoders: dict[str, DictEncoder | LabelEncoder]): - self.encoders = encoders + def __init__( + self, + label_encoder: DictEncoder | LabelEncoder, + categorical_encoders: Optional[dict[str, DictEncoder | LabelEncoder]] = None, + ): + self.categorical_encoders = categorical_encoders + + if not isinstance(label_encoder, (DictEncoder, LabelEncoder)): + raise TypeError( + f"label_encoder must be a DictEncoder or LabelEncoder instance, got {type(label_encoder)}" + ) + self.label_encoder = label_encoder @property def vocabulary_sizes(self) -> list[int]: """Number of unique categories per feature, in order.""" - sizes = [] - for enc in self.encoders.values(): - if isinstance(enc, DictEncoder): - sizes.append(len(enc.mapping)) - elif hasattr(enc, "classes_"): - sizes.append(len(enc.classes_)) - else: - raise TypeError(f"Unsupported encoder type: {type(enc)}") - return sizes + + if self.categorical_encoders is None: + return None + else: + sizes = [] + for enc in self.categorical_encoders.values(): + if isinstance(enc, DictEncoder): + sizes.append(len(enc.mapping)) + elif hasattr(enc, "classes_"): + sizes.append(len(enc.classes_)) + else: + raise TypeError(f"Unsupported encoder type: {type(enc)}") + return sizes + + @property + def num_classes(self) -> int: + """Number of unique classes in the label encoder, if provided.""" + if isinstance(self.label_encoder, DictEncoder): + return len(self.label_encoder.mapping) + elif hasattr(self.label_encoder, "classes_"): + return len(self.label_encoder.classes_) + else: + raise TypeError(f"Unsupported label encoder type: {type(self.label_encoder)}") def transform(self, X_categorical: np.ndarray) -> np.ndarray: """Encode all categorical columns to integer indices. 
@@ -78,11 +107,15 @@ def transform(self, X_categorical: np.ndarray) -> np.ndarray: Raises: ValueError: If any value was not seen during fitting. """ + + if self.categorical_encoders is None: + raise ValueError("No categorical encoders provided. Cannot transform data.") + if X_categorical.ndim == 1: X_categorical = X_categorical.reshape(-1, 1) result = np.empty(X_categorical.shape, dtype=np.int64) - for idx, (name, encoder) in enumerate(self.encoders.items()): + for idx, (name, encoder) in enumerate(self.categorical_encoders.items()): col = X_categorical[:, idx].astype(str) encoded = encoder.transform(col) try: @@ -96,5 +129,29 @@ def transform(self, X_categorical: np.ndarray) -> np.ndarray: return result + def transform_labels(self, y_labels: np.ndarray) -> np.ndarray: + """Encode label array to integer indices. + + Values are converted to strings before lookup. Unknown values raise a ValueError. + + Args: + y_labels: Array of shape (N,) with label values. + Returns: + Integer-encoded array of shape (N,), dtype int64. + Raises: + ValueError: If any label value was not seen during fitting. + """ + + col = y_labels.astype(str) + encoded = self.label_encoder.transform(col) + try: + return encoded.astype(np.int64) + except (TypeError, ValueError): + unknown = [v for v, e in zip(col.tolist(), encoded.tolist()) if e is None] + raise ValueError( + f"Unknown values in label encoder: {unknown}. " + "These values were not seen during fitting." 
+ ) + def __call__(self, array: np.ndarray) -> np.ndarray: return self.transform(array) diff --git a/torchTextClassifiers/torchTextClassifiers.py b/torchTextClassifiers/torchTextClassifiers.py index ca783d2..10b21c6 100644 --- a/torchTextClassifiers/torchTextClassifiers.py +++ b/torchTextClassifiers/torchTextClassifiers.py @@ -22,7 +22,7 @@ ModelCheckpoint, ) -from torchTextClassifiers.categorical_value_encoder import CategoricalValueEncoder +from torchTextClassifiers.categorical_value_encoder import ValueEncoder from torchTextClassifiers.dataset import TextClassificationDataset from torchTextClassifiers.model import TextClassificationModel, TextClassificationModule from torchTextClassifiers.model.components import ( @@ -51,7 +51,7 @@ class ModelConfig: """Base configuration class for text classifiers.""" embedding_dim: int - num_classes: int + num_classes: Optional[int] = None categorical_vocabulary_sizes: Optional[List[int]] = None categorical_embedding_dims: Optional[Union[List[int], int]] = None attention_config: Optional[AttentionConfig] = None @@ -70,6 +70,8 @@ class TrainingConfig: num_epochs: int batch_size: int lr: float + raw_categorical_inputs: Optional[bool] = True + raw_labels: Optional[bool] = True loss: torch.nn.Module = field(default_factory=lambda: torch.nn.CrossEntropyLoss()) optimizer: Type[torch.optim.Optimizer] = torch.optim.Adam scheduler: Optional[Type[torch.optim.lr_scheduler._LRScheduler]] = None @@ -107,7 +109,7 @@ def __init__( tokenizer: BaseTokenizer, model_config: ModelConfig, ragged_multilabel: bool = False, - categorical_encoder: Optional[CategoricalValueEncoder] = None, + value_encoder: Optional[ValueEncoder] = None, ): """Initialize the torchTextClassifiers instance. 
@@ -115,7 +117,7 @@ def __init__( tokenizer: A tokenizer instance for text preprocessing model_config: Configuration parameters for the text classification model ragged_multilabel: Whether to use ragged multilabel classification - categorical_encoder: Optional CategoricalValueEncoder for encoding + value_encoder: Optional ValueEncoder for encoding raw string (or mixed) categorical values to integers. Build it beforehand from DictEncoder or sklearn LabelEncoder instances and pass it here. If None, categorical columns in X must already be @@ -123,11 +125,11 @@ def __init__( Example: >>> from torchTextClassifiers import ModelConfig, TrainingConfig, torchTextClassifiers - >>> from torchTextClassifiers.categorical_value_encoder import CategoricalValueEncoder, DictEncoder + >>> from torchTextClassifiers.categorical_value_encoder import ValueEncoder, DictEncoder >>> # Build one DictEncoder per categorical feature >>> encoders = {str(i): DictEncoder({v: j for j, v in enumerate(sorted(set(X_categorical[:, i])))}) ... for i in range(X_categorical.shape[1])} - >>> encoder = CategoricalValueEncoder(encoders) + >>> encoder = ValueEncoder(encoders) >>> model_config = ModelConfig( ... embedding_dim=10, ... categorical_vocabulary_sizes=encoder.vocabulary_sizes, @@ -137,14 +139,14 @@ def __init__( >>> ttc = torchTextClassifiers( ... tokenizer=tokenizer, ... model_config=model_config, - ... categorical_encoder=encoder, + ... value_encoder=encoder, ... 
) """ self.model_config = model_config self.tokenizer = tokenizer self.ragged_multilabel = ragged_multilabel - self.categorical_encoder: CategoricalValueEncoder | None = categorical_encoder + self.value_encoder: ValueEncoder | None = value_encoder if hasattr(self.tokenizer, "trained"): if not self.tokenizer.trained: @@ -154,8 +156,24 @@ def __init__( self.vocab_size = tokenizer.vocab_size self.embedding_dim = model_config.embedding_dim - self.categorical_vocabulary_sizes = model_config.categorical_vocabulary_sizes - self.num_classes = model_config.num_classes + + if self.value_encoder is not None: + if (model_config.num_classes != self.value_encoder.num_classes) or ( + model_config.categorical_vocabulary_sizes != self.value_encoder.vocabulary_sizes + ): + logger.info( + "Overriding model_config num_classes and/or categorical_vocabulary_sizes with values from value_encoder." + ) + self.categorical_vocabulary_sizes = self.value_encoder.vocabulary_sizes + self.num_classes = self.value_encoder.num_classes + else: + self.categorical_vocabulary_sizes = model_config.categorical_vocabulary_sizes + if model_config.num_classes is None: + raise ValueError( + "num_classes must be specified in the model configuration if no value_encoder is provided." 
+ ) + self.num_classes = model_config.num_classes + self.enable_label_attention = model_config.n_heads_label_attention is not None if self.tokenizer.output_vectorized: @@ -173,7 +191,9 @@ def __init__( label_attention_config=LabelAttentionConfig( n_head=model_config.n_heads_label_attention, num_classes=model_config.num_classes, - ), + ) + if self.enable_label_attention + else None, ) self.text_embedder = TextEmbedder( text_embedder_config=text_embedder_config, @@ -197,7 +217,7 @@ def __init__( input_dim=classif_head_input_dim, num_classes=1 if self.enable_label_attention - else model_config.num_classes, # output dim is 1 when using label attention, because embeddings are (num_classes, embedding_dim) + else self.num_classes, # output dim is 1 when using label attention, because embeddings are (num_classes, embedding_dim) ) self.pytorch_model = TextClassificationModel( @@ -255,38 +275,30 @@ def train( ... ) """ - if X_train[:, 1:].dtype != np.int64: - if self.categorical_encoder is not None: - if verbose: - logger.info("Encoding categorical variables in training data...") - X_train[:, 1:] = self.categorical_encoder.transform(X_train[:, 1:]) - else: - raise ValueError( - "Categorical variables must be integer-encoded: provide a CategoricalValueEncoder." - ) - # Input validation - X_train_checked, y_train = self._check_XY(X_train, y_train) - + X_train, y_train = self._check_XY( + X_train, y_train, training_config.raw_categorical_inputs, training_config.raw_labels + ) + print(X_train, y_train) if X_val is not None: assert y_val is not None, "y_val must be provided if X_val is provided." if y_val is not None: assert X_val is not None, "X_val must be provided if y_val is provided." 
- X_val_checked: Optional[Dict[str, Any]] = None + X_val: Optional[Dict[str, Any]] = None if X_val is not None and y_val is not None: - X_val_checked, y_val = self._check_XY(X_val, y_val) + X_val, y_val = self._check_XY(X_val, y_val) if ( - X_train_checked["categorical_variables"] is not None - and X_val_checked is not None - and X_val_checked["categorical_variables"] is not None + X_train["categorical_variables"] is not None + and X_val is not None + and X_val["categorical_variables"] is not None ): assert ( - X_train_checked["categorical_variables"].ndim > 1 - and X_train_checked["categorical_variables"].shape[1] - == X_val_checked["categorical_variables"].shape[1] - or X_val_checked["categorical_variables"].ndim == 1 + X_train["categorical_variables"].ndim > 1 + and X_train["categorical_variables"].shape[1] + == X_val["categorical_variables"].shape[1] + or X_val["categorical_variables"].ndim == 1 ), "X_train and X_val must have the same number of columns." if verbose: @@ -326,8 +338,8 @@ def train( logger.info(f"Running on: {device}") train_dataset = TextClassificationDataset( - texts=X_train_checked["text"], - categorical_variables=X_train_checked["categorical_variables"], # None if no cat vars + texts=X_train["text"], + categorical_variables=X_train["categorical_variables"], # None if no cat vars tokenizer=self.tokenizer, labels=y_train.tolist(), ragged_multilabel=self.ragged_multilabel, @@ -339,10 +351,10 @@ def train( **training_config.dataloader_params if training_config.dataloader_params else {}, ) - if X_val_checked is not None and y_val is not None: + if X_val is not None and y_val is not None: val_dataset = TextClassificationDataset( - texts=X_val_checked["text"], - categorical_variables=X_val_checked["categorical_variables"], # None if no cat vars + texts=X_val["text"], + categorical_variables=X_val["categorical_variables"], # None if no cat vars tokenizer=self.tokenizer, labels=y_val, ragged_multilabel=self.ragged_multilabel, @@ -417,9 +429,11 @@ def 
train( self.lightning_module.eval() - def _check_XY(self, X: np.ndarray, Y: np.ndarray) -> Tuple[Dict[str, Any], np.ndarray]: - X_checked = self._check_X(X) - Y_checked = self._check_Y(Y) + def _check_XY( + self, X: np.ndarray, Y: np.ndarray, raw_categorical_inputs, raw_labels + ) -> Tuple[Dict[str, Any], np.ndarray]: + X_checked = self._check_X(X, raw_categorical_inputs) + Y_checked = self._check_Y(Y, raw_labels) if X_checked["text"].shape[0] != len(Y_checked): raise ValueError("X_train and y_train must have the same number of observations.") @@ -442,10 +456,12 @@ def _check_text_col(X): return text - def _check_categorical_variables(self, X: np.ndarray) -> np.ndarray: + def _check_categorical_variables( + self, X: np.ndarray, raw_categorical_inputs: bool + ) -> np.ndarray: """Validate and encode categorical variables from X. - If a ``categorical_encoder`` was provided at initialization, raw string + If a ``value_encoder`` was provided at initialization, raw string or mixed values are encoded to integers via that encoder. Otherwise the categorical columns must already be integer-encodable. @@ -471,6 +487,13 @@ def _check_categorical_variables(self, X: np.ndarray) -> np.ndarray: f"embedding layers in the categorical net: ({self.categorical_var_net.num_categorical_features})." ) + if raw_categorical_inputs: + if self.value_encoder is None: + raise ValueError( + "Raw categorical input encoding is enabled, but no value_encoder was provided. Please provide a ValueEncoder to encode raw categorical values to integers." 
+ ) + X[:, 1:] = self.value_encoder.transform(X[:, 1:]) + categorical_variables = X[:, 1:].astype(int) for j in range(num_cat_vars): @@ -483,16 +506,16 @@ def _check_categorical_variables(self, X: np.ndarray) -> np.ndarray: return categorical_variables - def _check_X(self, X: np.ndarray) -> Dict[str, Any]: + def _check_X(self, X: np.ndarray, raw_categorical_inputs: bool) -> Dict[str, Any]: text = self._check_text_col(X) categorical_variables = None if self.categorical_var_net is not None: - categorical_variables = self._check_categorical_variables(X) + categorical_variables = self._check_categorical_variables(X, raw_categorical_inputs) return {"text": text, "categorical_variables": categorical_variables} - def _check_Y(self, Y): + def _check_Y(self, Y, raw_labels: bool) -> np.ndarray: if self.ragged_multilabel: assert isinstance( Y, list @@ -508,10 +531,13 @@ def _check_Y(self, Y): len(Y.shape) == 1 or len(Y.shape) == 2 ), "Y must be a numpy array of shape (N,) or (N, num_labels)." - try: - Y = Y.astype(int) - except ValueError: - logger.error("Y must be castable in integer format.") + if raw_labels: + if self.value_encoder is None: + raise ValueError( + "Raw label encoding is enabled, but no value_encoder was provided. Please provide a ValueEncoder to encode raw labels to integers." 
+ ) + Y = self.value_encoder.transform_labels(Y) + Y = Y.astype(int) if Y.max() >= self.num_classes or Y.min() < 0: raise ValueError( @@ -686,7 +712,7 @@ def save(self, path: Union[str, Path]) -> None: "num_classes": self.num_classes, "checkpoint_path": str(checkpoint_path) if checkpoint_path else None, "device": str(self.device) if hasattr(self, "device") else None, - "has_categorical_encoder": self.categorical_encoder is not None, + "has_value_encoder": self.value_encoder is not None, } # Save metadata @@ -699,9 +725,9 @@ def save(self, path: Union[str, Path]) -> None: pickle.dump(self.tokenizer, f) # Save categorical encoder if present - if self.categorical_encoder is not None: - with open(path / "categorical_encoder.pkl", "wb") as f: - pickle.dump(self.categorical_encoder, f) + if self.value_encoder is not None: + with open(path / "value_encoder.pkl", "wb") as f: + pickle.dump(self.value_encoder, f) logger.info(f"Model saved successfully to {path}") @@ -737,19 +763,19 @@ def load(cls, path: Union[str, Path], device: str = "auto") -> "torchTextClassif model_config = ModelConfig.from_dict(metadata["model_config"]) # Load categorical encoder if one was saved - categorical_encoder = None - if metadata.get("has_categorical_encoder"): - encoder_path = path / "categorical_encoder.pkl" + value_encoder = None + if metadata.get("has_value_encoder"): + encoder_path = path / "value_encoder.pkl" if encoder_path.exists(): with open(encoder_path, "rb") as f: - categorical_encoder = pickle.load(f) + value_encoder = pickle.load(f) # Create instance instance = cls( tokenizer=tokenizer, model_config=model_config, ragged_multilabel=metadata["ragged_multilabel"], - categorical_encoder=categorical_encoder, + value_encoder=value_encoder, ) # Set device From 9aff671800cac19ca3eade11c1e31b1b2ee66483 Mon Sep 17 00:00:00 2001 From: meilame-tayebjee Date: Thu, 16 Apr 2026 09:24:06 +0000 Subject: [PATCH 4/7] chore: rename file --- .../categorical_value_encoder/__init__.py | 2 -- 
.../value_encoder/__init__.py | 2 ++ .../value_encoder.py} | 27 +++++++++++++++++-- 3 files changed, 27 insertions(+), 4 deletions(-) delete mode 100644 torchTextClassifiers/categorical_value_encoder/__init__.py create mode 100644 torchTextClassifiers/value_encoder/__init__.py rename torchTextClassifiers/{categorical_value_encoder/categorical_value_encoder.py => value_encoder/value_encoder.py} (83%) diff --git a/torchTextClassifiers/categorical_value_encoder/__init__.py b/torchTextClassifiers/categorical_value_encoder/__init__.py deleted file mode 100644 index 7152162..0000000 --- a/torchTextClassifiers/categorical_value_encoder/__init__.py +++ /dev/null @@ -1,2 +0,0 @@ -from .categorical_value_encoder import DictEncoder as DictEncoder -from .categorical_value_encoder import ValueEncoder as ValueEncoder diff --git a/torchTextClassifiers/value_encoder/__init__.py b/torchTextClassifiers/value_encoder/__init__.py new file mode 100644 index 0000000..0351bde --- /dev/null +++ b/torchTextClassifiers/value_encoder/__init__.py @@ -0,0 +1,2 @@ +from .value_encoder import DictEncoder as DictEncoder +from .value_encoder import ValueEncoder as ValueEncoder diff --git a/torchTextClassifiers/categorical_value_encoder/categorical_value_encoder.py b/torchTextClassifiers/value_encoder/value_encoder.py similarity index 83% rename from torchTextClassifiers/categorical_value_encoder/categorical_value_encoder.py rename to torchTextClassifiers/value_encoder/value_encoder.py index e653721..8215093 100644 --- a/torchTextClassifiers/categorical_value_encoder/categorical_value_encoder.py +++ b/torchTextClassifiers/value_encoder/value_encoder.py @@ -42,7 +42,8 @@ class ValueEncoder: Initialization: - label_encoder: A DictEncoder or LabelEncoder instance for encoding labels. - - encoders (optional): A dictionary mapping feature names to DictEncoder or LabelEncoder instances. + - encoders (optional): A dictionary mapping feature names to DictEncoder or + LabelEncoder instances. 
Properties: - vocabulary_sizes: List of vocabulary sizes (number of unique values) for each feature. @@ -62,7 +63,8 @@ def __init__( if not isinstance(label_encoder, (DictEncoder, LabelEncoder)): raise TypeError( - f"label_encoder must be a DictEncoder or LabelEncoder instance, got {type(label_encoder)}" + "label_encoder must be a DictEncoder or LabelEncoder instance, " + f"got {type(label_encoder)}" ) self.label_encoder = label_encoder @@ -153,5 +155,26 @@ def transform_labels(self, y_labels: np.ndarray) -> np.ndarray: "These values were not seen during fitting." ) + def inverse_transform_labels(self, y_encoded: np.ndarray) -> np.ndarray: + """Decode integer-encoded labels back to original values. + + Args: + y_encoded: Array of shape (N,) with integer-encoded labels. + Returns: + Array of shape (N,) with original label values. + Raises: + ValueError: If any encoded label value was not seen during fitting. + """ + + if isinstance(self.label_encoder, DictEncoder): + inverse_mapping = self.label_encoder.inverse_mapping + return np.vectorize(inverse_mapping.get)(y_encoded) + elif hasattr(self.label_encoder, "inverse_transform"): + shape = y_encoded.shape + result = self.label_encoder.inverse_transform(y_encoded.ravel()) + return result.reshape(shape) if len(shape) > 1 else result + else: + raise TypeError(f"Unsupported label encoder type: {type(self.label_encoder)}") + def __call__(self, array: np.ndarray) -> np.ndarray: return self.transform(array) From 1ff499b4ce66b3e0b7a90de733c465097211f065 Mon Sep 17 00:00:00 2001 From: meilame-tayebjee Date: Thu, 16 Apr 2026 09:27:35 +0000 Subject: [PATCH 5/7] feat: add label desencoding after prediction --- torchTextClassifiers/torchTextClassifiers.py | 22 +++++++++++++------- 1 file changed, 14 insertions(+), 8 deletions(-) diff --git a/torchTextClassifiers/torchTextClassifiers.py b/torchTextClassifiers/torchTextClassifiers.py index 10b21c6..fea5194 100644 --- a/torchTextClassifiers/torchTextClassifiers.py +++ 
b/torchTextClassifiers/torchTextClassifiers.py @@ -22,7 +22,6 @@ ModelCheckpoint, ) -from torchTextClassifiers.categorical_value_encoder import ValueEncoder from torchTextClassifiers.dataset import TextClassificationDataset from torchTextClassifiers.model import TextClassificationModel, TextClassificationModule from torchTextClassifiers.model.components import ( @@ -35,6 +34,7 @@ TextEmbedderConfig, ) from torchTextClassifiers.tokenizers import BaseTokenizer, TokenizerOutput +from torchTextClassifiers.value_encoder import ValueEncoder logger = logging.getLogger(__name__) @@ -125,7 +125,7 @@ def __init__( Example: >>> from torchTextClassifiers import ModelConfig, TrainingConfig, torchTextClassifiers - >>> from torchTextClassifiers.categorical_value_encoder import ValueEncoder, DictEncoder + >>> from torchTextClassifiers.value_encoder import ValueEncoder, DictEncoder >>> # Build one DictEncoder per categorical feature >>> encoders = {str(i): DictEncoder({v: j for j, v in enumerate(sorted(set(X_categorical[:, i])))}) ... for i in range(X_categorical.shape[1])} @@ -492,9 +492,9 @@ def _check_categorical_variables( raise ValueError( "Raw categorical input encoding is enabled, but no value_encoder was provided. Please provide a ValueEncoder to encode raw categorical values to integers." 
) - X[:, 1:] = self.value_encoder.transform(X[:, 1:]) - - categorical_variables = X[:, 1:].astype(int) + categorical_variables = self.value_encoder.transform(X[:, 1:]).astype(int) + else: + categorical_variables = X[:, 1:].astype(int) for j in range(num_cat_vars): max_cat_value = categorical_variables[:, j].max() @@ -549,6 +549,7 @@ def _check_Y(self, Y, raw_labels: bool) -> np.ndarray: def predict( self, X_test: np.ndarray, + raw_categorical_inputs: bool = True, top_k=1, explain_with_label_attention: bool = False, explain_with_captum=False, @@ -593,7 +594,7 @@ def predict( return_offsets_mapping = False return_word_ids = False - X_test = self._check_X(X_test) + X_test = self._check_X(X_test, raw_categorical_inputs) text = X_test["text"] categorical_variables = X_test["categorical_variables"] @@ -638,7 +639,12 @@ def predict( label_scores_topk = torch.topk(label_scores, k=top_k, dim=1) - predictions = label_scores_topk.indices # get the top_k most likely predictions + integer_predictions = label_scores_topk.indices # integer class indices (needed for captum) + if self.value_encoder is not None: + predictions = self.value_encoder.inverse_transform_labels(integer_predictions.numpy()) + else: + predictions = integer_predictions + confidence = torch.round(label_scores_topk.values, decimals=2) # and their scores if explain: @@ -648,7 +654,7 @@ def predict( for k in range(top_k): attributions = lig.attribute( (encoded_text, attention_mask, categorical_vars), - target=torch.Tensor(predictions[:, k]).long(), + target=integer_predictions[:, k], ) # (batch_size, seq_len) attributions = attributions.sum(dim=-1) captum_attributions.append(attributions.detach().cpu()) From a2572ade4d8b22040a5fbd257ae28decd18fd7c6 Mon Sep 17 00:00:00 2001 From: meilame-tayebjee Date: Thu, 16 Apr 2026 09:27:52 +0000 Subject: [PATCH 6/7] fix: fix tests to new value encoder --- tests/test_pipeline.py | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git 
a/tests/test_pipeline.py b/tests/test_pipeline.py index 94e1708..d998989 100644 --- a/tests/test_pipeline.py +++ b/tests/test_pipeline.py @@ -4,7 +4,6 @@ from sklearn.preprocessing import LabelEncoder from torchTextClassifiers import ModelConfig, TrainingConfig, torchTextClassifiers -from torchTextClassifiers.categorical_value_encoder import CategoricalValueEncoder, DictEncoder from torchTextClassifiers.dataset import TextClassificationDataset from torchTextClassifiers.model import TextClassificationModel, TextClassificationModule from torchTextClassifiers.model.components import ( @@ -16,6 +15,7 @@ TextEmbedderConfig, ) from torchTextClassifiers.tokenizers import NGramTokenizer +from torchTextClassifiers.value_encoder import DictEncoder, ValueEncoder try: from torchTextClassifiers.tokenizers import HuggingFaceTokenizer, WordPieceTokenizer @@ -88,15 +88,16 @@ def run_full_pipeline( ) for i in range(n_features) } - cat_encoder = CategoricalValueEncoder(encoders) - encoded_categorical = cat_encoder.transform(categorical_data) - vocab_sizes = cat_encoder.vocabulary_sizes # --- Encode string labels to contiguous integers --- label_encoder = LabelEncoder() - encoded_labels = label_encoder.fit_transform(labels) + label_encoder.fit(labels) num_classes = len(label_encoder.classes_) + value_encoder = ValueEncoder(label_encoder, encoders) + encoded_categorical = value_encoder.transform(categorical_data) + vocab_sizes = value_encoder.vocabulary_sizes + # --- Direct component test: dataset with already-encoded integers --- dataset = TextClassificationDataset( texts=sample_text_data, @@ -180,7 +181,7 @@ def run_full_pipeline( # --- Wrapper pipeline with string categorical data --- # X keeps categorical columns as raw strings; the wrapper encoder handles them. 
X = np.column_stack([sample_text_data, categorical_data]) - Y = encoded_labels # integer-encoded labels (from LabelEncoder) + Y = labels # raw string labels (encoded by value_encoder) # Create model config (vocab sizes and num_classes come from the encoders) model_config = ModelConfig( @@ -198,11 +199,11 @@ def run_full_pipeline( num_epochs=1, ) - # Create classifier — pass the fitted categorical encoder + # Create classifier — pass the fitted value encoder ttc = torchTextClassifiers( tokenizer=tokenizer, model_config=model_config, - categorical_encoder=cat_encoder, + value_encoder=value_encoder, ) # Train with raw string categorical data From 199627862d6c2fe123c2bf528eaaf4537886fba2 Mon Sep 17 00:00:00 2001 From: meilame-tayebjee Date: Thu, 16 Apr 2026 09:53:38 +0000 Subject: [PATCH 7/7] docs: include label attention and value encoder --- README.md | 7 +- docs/source/architecture/overview.md | 121 +++- docs/source/tutorials/explainability.md | 868 ++++++++++-------------- docs/source/tutorials/mixed_features.md | 172 ++++- torchTextClassifiers/test copy.py | 107 +++ torchTextClassifiers/test.py | 95 +++ 6 files changed, 800 insertions(+), 570 deletions(-) create mode 100644 torchTextClassifiers/test copy.py create mode 100644 torchTextClassifiers/test.py diff --git a/README.md b/README.md index 55ff2ec..91187a0 100644 --- a/README.md +++ b/README.md @@ -7,6 +7,7 @@ A unified, extensible framework for text classification with categorical variabl ## 🚀 Features - **Complex input support**: Handle text data alongside categorical variables seamlessly. + - **ValueEncoder**: Pass raw string categorical values and labels directly — no manual integer encoding required. Build a `ValueEncoder` from `DictEncoder` or sklearn `LabelEncoder` instances once, and the wrapper handles encoding at train time and label decoding after prediction automatically. - **Unified yet highly customizable**: - Use any tokenizer from HuggingFace or the original fastText's ngram tokenizer. 
- Manipulate the components (`TextEmbedder`, `CategoricalVariableNet`, `ClassificationHead`) to easily create custom architectures - including **self-attention**. All of them are `torch.nn.Module` ! @@ -15,7 +16,9 @@ A unified, extensible framework for text classification with categorical variabl - **PyTorch Lightning**: Automated training with callbacks, early stopping, and logging - **Easy experimentation**: Simple API for training, evaluating, and predicting with minimal code: - The `torchTextClassifiers` wrapper class orchestrates the tokenizer and the model for you -- **Additional features**: explainability using Captum +- **Explainability**: + - **Captum integration**: gradient-based token attribution via integrated gradients (`explain_with_captum=True`). + - **Label attention**: class-specific cross-attention that produces one sentence embedding per class, enabling token-level explanations for each label (`explain_with_label_attention=True`). Enable it by setting `n_heads_label_attention` in `ModelConfig`. ## 📦 Installation @@ -57,5 +60,3 @@ See the [examples/](examples/) directory for: ## 📄 License This project is licensed under the MIT License - see the [LICENSE](LICENSE) file for details. - - diff --git a/docs/source/architecture/overview.md b/docs/source/architecture/overview.md index 06b4467..3de20a6 100644 --- a/docs/source/architecture/overview.md +++ b/docs/source/architecture/overview.md @@ -11,11 +11,80 @@ At its core, torchTextClassifiers processes data through a simple pipeline: ``` **Data Flow:** -1. **Text** is tokenized into numerical tokens -2. **Tokens** are embedded into dense vectors (with optional attention) -3. **Categorical variables** (optional) are embedded separately -4. **All embeddings** are combined -5. **Classification head** produces final predictions +1. **ValueEncoder** (optional) converts raw string categorical values and labels into integers +2. **Text** is tokenized into numerical tokens +3. 
**Tokens** are embedded into dense vectors (with optional self-attention) + — or into one embedding *per class* if **label attention** is enabled +4. **Categorical variables** (optional) are embedded separately +5. **All embeddings** are combined +6. **Classification head** produces final predictions + — if a `ValueEncoder` was provided, integer predictions are decoded back to original labels + +## Component 0: ValueEncoder (optional preprocessing) + +**Purpose:** Encode raw string (or mixed-type) categorical values and labels into +integer indices, and decode predicted integers back to original label values after +inference. + +### When to Use + +Use `ValueEncoder` whenever your categorical features or labels are stored as strings +(e.g. `"Electronics"`, `"positive"`) rather than integers. Without it, you must +integer-encode inputs manually before passing them to `train` / `predict`. + +### Building a ValueEncoder + +```python +from sklearn.preprocessing import LabelEncoder +from torchTextClassifiers.value_encoder import DictEncoder, ValueEncoder + +# Option A: sklearn LabelEncoder (fit on train data) +cat_encoder = LabelEncoder().fit(X_train_categories) + +# Option B: explicit dict mapping +cat_encoder = DictEncoder({"Electronics": 0, "Audio": 1, "Books": 2}) + +value_encoder = ValueEncoder( + label_encoder=LabelEncoder().fit(y_train), # encodes/decodes labels + categorical_encoders={ + "category": cat_encoder, # one entry per categorical column + # "brand": brand_encoder, # add more as needed + }, +) +``` + +### What It Provides + +```python +value_encoder.vocabulary_sizes # [3, ...] – inferred from each encoder +value_encoder.num_classes # 2 – inferred from label encoder +``` + +These are read automatically by `torchTextClassifiers` when constructing the model, +so you don't need to set `num_classes` or `categorical_vocabulary_sizes` in `ModelConfig` +manually. 
+ +### Integration with the Wrapper + +```python +classifier = torchTextClassifiers( + tokenizer=tokenizer, + model_config=ModelConfig(embedding_dim=64), # num_classes inferred from encoder + value_encoder=value_encoder, +) + +# Train with raw string inputs (default: raw_categorical_inputs=True, raw_labels=True) +classifier.train(X_train, y_train, training_config) + +# Predict — output labels are decoded back to original strings automatically +result = classifier.predict(X_test) +print(result["prediction"]) # ["positive", "negative", ...] +``` + +The `ValueEncoder` is saved and reloaded with the model via `classifier.save()` / +`torchTextClassifiers.load()`. + +--- ## Component 1: Tokenizer @@ -144,6 +213,39 @@ embedder = TextEmbedder(config) - `n_head`: Number of attention heads (typically 4, 8, or 16) - `n_layer`: Depth of transformer (start with 2-3) +### With Label Attention (Optional Explainability Layer) + +Label attention replaces mean-pooling with a **cross-attention mechanism** where each +class has a learnable embedding that attends over the token sequence: + +``` +Token embeddings (batch, seq_len, d) + ↓ cross-attention (labels as queries, tokens as keys/values) +Sentence embeddings (batch, num_classes, d) ← one per class + ↓ +ClassificationHead (d → 1) ← shared, applied per class + ↓ +Logits (batch, num_classes) +``` + +Enable it by setting `n_heads_label_attention` in `ModelConfig`: + +```python +model_config = ModelConfig( + embedding_dim=96, + num_classes=6, + n_heads_label_attention=4, # number of attention heads for label attention +) +``` + +**Benefits:** +- Free explainability at inference time (`explain_with_label_attention=True` in `predict`) +- The returned attention matrix `(batch, n_head, num_classes, seq_len)` shows which + tokens each class focuses on +- Can be combined with self-attention (`attention_config`) + +**Constraint:** `embedding_dim` must be divisible by `n_heads_label_attention`. 
+ ## Component 3: Categorical Variable Handler **Purpose:** Process categorical features (like user demographics, product categories) alongside text. @@ -276,7 +378,7 @@ head = ClassificationHead(net=custom_head) ## Complete Architecture ```{thumbnail} diagrams/NN.drawio.png -:alt: +:alt: ``` ### Full Model Assembly @@ -592,9 +694,10 @@ categorical_embedding_dim = min(50, 10 // 2) = 5 torchTextClassifiers provides a **component-based pipeline** for text classification: +0. **ValueEncoder** (optional) → Encodes raw string inputs; decodes predictions back to original labels 1. **Tokenizer** → Converts text to tokens -2. **Text Embedder** → Creates semantic embeddings (with optional attention) -3. **Categorical Handler** → Processes additional features (optional) +2. **Text Embedder** → Creates semantic embeddings (with optional self-attention and/or label attention) +3. **Categorical Handler** (optional) → Processes additional categorical features 4. **Classification Head** → Produces predictions **Key Benefits:** @@ -610,5 +713,3 @@ torchTextClassifiers provides a **component-based pipeline** for text classifica - **Examples**: Explore complete examples in the repository Ready to build your classifier? Start with {doc}`../getting_started/quickstart`! - - diff --git a/docs/source/tutorials/explainability.md b/docs/source/tutorials/explainability.md index 55c8c06..24b5910 100644 --- a/docs/source/tutorials/explainability.md +++ b/docs/source/tutorials/explainability.md @@ -1,525 +1,343 @@ -# Model Explainability - -Understand which words and characters drive your model's predictions using attribution analysis. 
- -## Learning Objectives - -By the end of this tutorial, you'll be able to: - -- Generate explanations for individual predictions -- Visualize word-level and character-level contributions -- Identify the most influential tokens -- Use interactive explainability for debugging -- Understand Captum integration for attribution analysis - -## Prerequisites - -- Completed {doc}`basic_classification` tutorial -- Familiarity with model predictions -- (Optional) Understanding of gradient-based attribution methods - -## What Is Explainability? - -**Model explainability** reveals which parts of the input contribute most to a prediction. For text classification: - -- **Word-level**: Which words influence the prediction? -- **Character-level**: Which characters matter most? -- **Attribution scores**: How much each token contributes (positive or negative) - -### Why Use Explainability? - -✅ **Debugging**: Identify if model focuses on correct features -✅ **Trust**: Understand and validate model decisions -✅ **Bias detection**: Discover unwanted correlations -✅ **Feature engineering**: Guide feature selection - -## Complete Example - -```python -import numpy as np -from torchTextClassifiers import ModelConfig, TrainingConfig, torchTextClassifiers -from torchTextClassifiers.tokenizers import WordPieceTokenizer - -# Training data -X_train = np.array([ - "I love this product", - "Great quality and excellent service", - "Amazing design and fantastic performance", - "This is terrible quality", - "Poor design and cheap materials", - "Awful experience with this product" -]) - -y_train = np.array([1, 1, 1, 0, 0, 0]) # 1 = Positive, 0 = Negative - -X_val = np.array([ - "Good product with decent quality", - "Bad quality and poor service" -]) -y_val = np.array([1, 0]) - -# Create and train tokenizer -tokenizer = WordPieceTokenizer(vocab_size=5000) -tokenizer.train(X_train.tolist()) - -# Create model -model_config = ModelConfig( - embedding_dim=50, - num_classes=2 -) - -classifier = 
torchTextClassifiers( - tokenizer=tokenizer, - model_config=model_config -) - -# Train -training_config = TrainingConfig( - num_epochs=25, - batch_size=8, - lr=1e-3 -) - -classifier.train( - X_train, y_train, X_val, y_val, - training_config=training_config -) - -# Test with explainability -test_text = "This product is amazing!" - -result = classifier.predict( - np.array([test_text]), - explain=True # Enable explainability -) - -# Extract results -prediction = result["prediction"][0][0].item() -confidence = result["confidence"][0][0].item() -attributions = result["attributions"][0][0] # Token-level attributions - -print(f"Prediction: {'Positive' if prediction == 1 else 'Negative'}") -print(f"Confidence: {confidence:.4f}") -print(f"Attribution shape: {attributions.shape}") -``` - -## Step-by-Step Walkthrough - -### 1. Enable Explainability - -Add `explain=True` to `predict()`: - -```python -result = classifier.predict( - X_test, - explain=True # Generate attribution scores -) -``` - -### 2. Understanding the Output - -The result dictionary contains additional keys: - -```python -{ - "prediction": tensor, # Class predictions - "confidence": tensor, # Confidence scores - "attributions": tensor, # Token-level attribution scores - "offset_mapping": list, # Character positions of tokens - "word_ids": list # Word IDs for each token -} -``` - -**Attributions shape:** `(batch_size, top_k, sequence_length)` -- Higher values = stronger influence on prediction -- Positive values = supports predicted class -- Negative values = opposes predicted class - -### 3. 
Visualize Word Contributions - -Map token attributions to words: - -```python -from torchTextClassifiers.utilities.plot_explainability import map_attributions_to_word - -# Get attribution data -attributions = result["attributions"][0][0] # Shape: (seq_len,) -word_ids = result["word_ids"][0] # List of word IDs - -# Map to words -words = test_text.split() -word_attributions = [] - -for word_idx in range(len(words)): - # Find tokens belonging to this word - token_mask = [wid == word_idx for wid in word_ids] - token_attrs = attributions[token_mask] - - if len(token_attrs) > 0: - word_attr = token_attrs.mean().item() - word_attributions.append((words[word_idx], word_attr)) - -# Display results -print("\nWord-Level Contributions:") -print("-" * 50) -for word, score in word_attributions: - print(f"{word:>15} | {'█' * int(score * 40)} {score:.4f}") -``` - -### 4. Character-Level Visualization - -For finer-grained analysis: - -```python -from torchTextClassifiers.utilities.plot_explainability import map_attributions_to_char - -# Map token attributions to characters -char_attributions = map_attributions_to_char( - attributions.unsqueeze(0), # Add batch dimension - result["offset_mapping"][0], - test_text -)[0] - -# Visualize -print("\nCharacter-Level Contributions:") -for i, char in enumerate(test_text): - if i < len(char_attributions): - score = char_attributions[i] - bar = "█" * int(score * 20) - print(f"{char} | {bar} {score:.4f}") -``` - -## Complete Visualization Example - -Here's a complete function to visualize word importance: - -```python -def explain_prediction(classifier, text): - """Generate and visualize explanations for a prediction.""" - import numpy as np - - # Get prediction with explainability - result = classifier.predict( - np.array([text]), - top_k=1, - explain=True - ) - - # Extract prediction info - prediction = result["prediction"][0][0].item() - confidence = result["confidence"][0][0].item() - sentiment = "Positive" if prediction == 1 else "Negative" 
- - print(f"Text: '{text}'") - print(f"Prediction: {sentiment} (confidence: {confidence:.4f})") - print("\n" + "="*60) - - # Get attributions - attributions = result["attributions"][0][0] - offset_mapping = result["offset_mapping"][0] - - # Map to characters - from torchTextClassifiers.utilities.plot_explainability import map_attributions_to_char - char_attrs = map_attributions_to_char( - attributions.unsqueeze(0), - offset_mapping, - text - )[0] - - # Group by words - words = text.split() - char_idx = 0 - word_scores = [] - - for word in words: - word_len = len(word) - word_attrs = char_attrs[char_idx:char_idx + word_len] - - if len(word_attrs) > 0: - avg_attr = sum(word_attrs) / len(word_attrs) - word_scores.append((word, avg_attr)) - - char_idx += word_len + 1 # +1 for space - - # Visualize - max_score = max(score for _, score in word_scores) if word_scores else 1 - - print("Word Contributions:") - print("-" * 60) - for word, score in word_scores: - bar_length = int((score / max_score) * 40) - bar = "█" * bar_length - print(f"{word:>15} | {bar:<40} {score:.4f}") - - # Show top contributor - if word_scores: - top_word, top_score = max(word_scores, key=lambda x: x[1]) - print("-" * 60) - print(f"Most influential: '{top_word}' (score: {top_score:.4f})") - -# Use it -explain_prediction(classifier, "This product is amazing!") -explain_prediction(classifier, "Poor quality and terrible service") -``` - -## Interactive Explainability - -Create an interactive tool for exploring predictions: - -```python -def interactive_explainability(classifier): - """Interactive mode for exploring model predictions.""" - print("\n" + "="*60) - print("Interactive Explainability Mode") - print("="*60) - print("Enter text to see predictions and explanations!") - print("Type 'quit' to exit.\n") - - while True: - user_text = input("Enter text: ").strip() - - if user_text.lower() in ['quit', 'exit', 'q']: - print("Goodbye!") - break - - if not user_text: - print("Please enter some text.") - 
continue - - try: - explain_prediction(classifier, user_text) - print("\n" + "-"*60 + "\n") - except Exception as e: - print(f"Error: {e}") - -# Use it -interactive_explainability(classifier) -``` - -## Understanding Attribution Scores - -### What Do Scores Mean? - -- **High positive scores**: Strong support for predicted class -- **Low/negative scores**: Opposition to predicted class -- **Zero scores**: Neutral contribution - -### Example Interpretation - -For positive sentiment prediction: - -``` -Word Contributions: ------------------------------------------------------------- - This | █████ 0.1234 - product | ████████████████ 0.4567 - is | ██ 0.0543 - amazing | ██████████████████████████████ 0.8901 - ! | ███ 0.0876 ------------------------------------------------------------- -Most influential: 'amazing' (score: 0.8901) -``` - -**Interpretation:** -- "amazing" strongly indicates positive sentiment (0.89) -- "product" moderately supports positive (0.46) -- "is" is nearly neutral (0.05) - -## Debugging with Explainability - -### Case 1: Unexpected Predictions - -```python -test_text = "This product is not good" -explain_prediction(classifier, test_text) - -# Output might show: -# Word Contributions: -# not | ████ 0.12 <- Low attribution! 
-# good | ██████████ 0.45 <- High attribution for "good" -``` - -**Problem**: Model ignores "not", focuses on "good" -**Solution**: Add more negation examples to training data - -### Case 2: Correct Predictions, Wrong Reasons - -```python -test_text = "Product from China is excellent" -explain_prediction(classifier, test_text) - -# If "China" has high attribution, model may have learned spurious correlation -``` - -**Problem**: Model uses irrelevant features -**Solution**: Audit training data for bias, balance dataset - -### Case 3: Low Confidence - -```python -test_text = "Product arrived on time" -result = classifier.predict(np.array([test_text]), explain=True) -confidence = result["confidence"][0][0].item() # Low confidence - -explain_prediction(classifier, test_text) -# All words have similar low attribution scores -``` - -**Interpretation**: Text doesn't contain strong sentiment indicators -**This is correct behavior**: Model appropriately uncertain - -## Advanced: Custom Attribution Methods - -By default, torchTextClassifiers uses integrated gradients. 
For custom attribution: - -```python -from torchTextClassifiers.utilities.plot_explainability import generate_attributions -from captum.attr import LayerIntegratedGradients - -# Access the underlying model -model = classifier.model - -# Create custom attribution method -attribution_method = LayerIntegratedGradients( - model, - model.text_embedder.embedding -) - -# Generate attributions -attributions = generate_attributions( - classifier, - texts=["Your text here"], - attribution_method=attribution_method -) -``` - -## Common Issues - -### Issue 1: Explainability Fails - -**Error:** "explain=True requires captum package" - -**Solution:** Install explainability dependencies: -```bash -uv sync --extra explainability -``` - -### Issue 2: All Attributions Near Zero - -**Possible causes:** -- Model not well-trained -- Text contains no discriminative features -- Attribution method sensitivity - -**Try:** -- Train longer or with more data -- Check prediction confidence -- Verify model performance on test set - -### Issue 3: Inconsistent Attributions - -**Problem:** Same word has different attributions in different contexts - -**This is expected!** Attribution considers: -- Surrounding context -- Position in sentence -- Interaction with other words - -## Best Practices - -1. **Always check confidence:** Low confidence = less reliable attributions -2. **Compare multiple examples:** Look for patterns across predictions -3. **Validate with domain knowledge:** Do highlighted words make sense? -4. **Use for debugging, not blind trust:** Attributions are approximations -5. 
**Check training data:** High attribution may reveal training biases - -## Real-World Use Cases - -### Sentiment Analysis - -```python -positive_review = "Excellent product with amazing quality" -negative_review = "Terrible product with poor quality" - -for review in [positive_review, negative_review]: - explain_prediction(classifier, review) - print("\n" + "="*60 + "\n") -``` - -Verify that sentiment words ("excellent", "terrible") have highest attribution. - -### Spam Detection - -```python -spam_text = "Click here for free money now!" -explain_prediction(spam_classifier, spam_text) -``` - -Check if "free", "click", "now" are highlighted (common spam indicators). - -### Topic Classification - -```python -sports_text = "The team won the championship game" -explain_prediction(topic_classifier, sports_text) -``` - -Verify "team", "championship", "game" drive sports prediction. - -## Customization - -### Batch Explainability - -Explain multiple texts at once: - -```python -test_texts = [ - "Great product", - "Terrible experience", - "Average quality" -] - -result = classifier.predict( - np.array(test_texts), - explain=True -) - -for i, text in enumerate(test_texts): - print(f"\nText {i+1}: {text}") - attributions = result["attributions"][i][0] - print(f"Max attribution: {attributions.max():.4f}") -``` - -### Save Explanations - -Export attributions for analysis: - -```python -import json - -explanations = [] -for text in test_texts: - result = classifier.predict(np.array([text]), explain=True) - - explanations.append({ - "text": text, - "prediction": int(result["prediction"][0][0].item()), - "confidence": float(result["confidence"][0][0].item()), - "attributions": result["attributions"][0][0].tolist() - }) - -# Save to JSON -with open("explanations.json", "w") as f: - json.dump(explanations, f, indent=2) -``` - -## Summary - -**Key takeaways:** -- Use `explain=True` to generate attribution scores -- Visualize word and character contributions -- High attribution = 
strong influence on prediction -- Use explainability for debugging and validation -- Check if model focuses on correct features - -Ready for multilabel classification? Continue to {doc}`multilabel_classification`! +# Model Explainability + +Understand which words drive your model's predictions using two complementary methods: +**Captum** (gradient-based attribution) and **label attention** (class-specific cross-attention). + +## Learning Objectives + +By the end of this tutorial, you'll be able to: + +- Generate token-level attribution scores with Captum +- Use label attention to see which tokens influence each class +- Visualize word-level contributions +- Choose the right explainability method for your use case + +## Prerequisites + +- Completed {doc}`basic_classification` tutorial +- (Optional) Understanding of gradient-based attribution methods + +## What Is Explainability? + +**Model explainability** reveals which parts of the input contribute most to a prediction. For text classification: + +- **Word-level**: Which words influence the prediction? +- **Character-level**: Which characters matter most? +- **Attribution scores**: How much each token contributes (positive or negative) + +### Why Use Explainability? + +✅ **Debugging**: Identify if model focuses on correct features +✅ **Trust**: Understand and validate model decisions +✅ **Bias detection**: Discover unwanted correlations +✅ **Feature engineering**: Guide feature selection + +--- + +## Method 1: Captum (Integrated Gradients) + +Captum computes gradient-based token attributions, measuring how much each token +contributes to the final prediction score. 
+ +### Setup + +Install the optional explainability dependencies: + +```bash +uv sync --extra explainability +# or +pip install torchTextClassifiers[explainability] +``` + +### Quick Example + +```python +import numpy as np +from torchTextClassifiers import ModelConfig, TrainingConfig, torchTextClassifiers +from torchTextClassifiers.tokenizers import WordPieceTokenizer + +# Training data +X_train = np.array([ + "I love this product", + "Great quality and excellent service", + "Amazing design and fantastic performance", + "This is terrible quality", + "Poor design and cheap materials", + "Awful experience with this product" +]) +y_train = np.array([1, 1, 1, 0, 0, 0]) + +tokenizer = WordPieceTokenizer(vocab_size=5000) +tokenizer.train(X_train.tolist()) + +model_config = ModelConfig(embedding_dim=50, num_classes=2) +classifier = torchTextClassifiers(tokenizer=tokenizer, model_config=model_config) + +training_config = TrainingConfig(num_epochs=25, batch_size=8, lr=1e-3, + raw_categorical_inputs=False, raw_labels=False) +classifier.train(X_train, y_train, training_config=training_config) + +# Predict with Captum explainability +result = classifier.predict( + np.array(["This product is amazing!"]), + explain_with_captum=True, # <-- enable Captum attribution +) + +prediction = result["prediction"][0][0].item() +confidence = result["confidence"][0][0].item() +attributions = result["captum_attributions"][0][0] # shape: (seq_len,) + +print(f"Prediction: {'Positive' if prediction == 1 else 'Negative'}") +print(f"Confidence: {confidence:.4f}") +print(f"Attribution shape: {attributions.shape}") +``` + +### Output Dictionary + +When `explain_with_captum=True`, the result contains additional keys: + +```python +{ + "prediction": tensor, # class predictions (decoded if ValueEncoder present) + "confidence": tensor, # confidence scores + "captum_attributions": tensor, # shape (batch_size, top_k, seq_len) + "label_attention_attributions": None, + "offset_mapping": list, # character 
positions of each token + "word_ids": list, # word index for each token +} +``` + +**Attribution values:** +- Higher positive values → stronger support for the predicted class +- Negative values → oppose the predicted class +- Near zero → neutral contribution + +### Visualize Word Contributions + +```python +def explain_with_captum(classifier, text): + result = classifier.predict( + np.array([text]), + explain_with_captum=True + ) + + prediction = result["prediction"][0][0].item() + confidence = result["confidence"][0][0].item() + attributions = result["captum_attributions"][0][0] # (seq_len,) + offset_mapping = result["offset_mapping"][0] + + print(f"Text: '{text}'") + print(f"Prediction: {prediction} (confidence: {confidence:.4f})") + + # Map attributions to characters + char_attrs = [0.0] * len(text) + for (start, end), score in zip(offset_mapping, attributions.tolist()): + for i in range(start, end): + char_attrs[i] = score + + # Aggregate to words + words = text.split() + char_idx = 0 + print("\nWord Contributions:") + print("-" * 50) + for word in words: + scores = char_attrs[char_idx : char_idx + len(word)] + avg = sum(scores) / len(scores) if scores else 0.0 + bar = "█" * max(0, int(avg * 40)) + print(f"{word:>15} | {bar:<40} {avg:.4f}") + char_idx += len(word) + 1 # +1 for space + +explain_with_captum(classifier, "This product is amazing!") +``` + +--- + +## Method 2: Label Attention + +Label attention is a **built-in architectural feature** that produces one sentence +embedding per class via a learnable cross-attention mechanism. 
It is: + +- **Faster than Captum** at inference time (no gradient computation) +- **Class-specific**: shows which tokens matter for *each individual class* +- Enabled at model construction time via `n_heads_label_attention` in `ModelConfig` + +### Enable Label Attention + +```python +from torchTextClassifiers import ModelConfig, TrainingConfig, torchTextClassifiers +from torchTextClassifiers.model.components import AttentionConfig + +model_config = ModelConfig( + embedding_dim=96, + num_classes=6, + attention_config=AttentionConfig( # self-attention (optional but recommended) + n_layers=2, + n_head=4, + n_kv_head=4, + sequence_len=50, + ), + n_heads_label_attention=4, # <-- enables label attention with 4 heads +) + +classifier = torchTextClassifiers( + tokenizer=tokenizer, + model_config=model_config, +) +classifier.train(X_train, y_train, training_config=training_config) +``` + +### Predict with Label Attention + +```python +result = classifier.predict( + X_test, + explain_with_label_attention=True, # <-- request attention weights +) + +# Attention matrix: which tokens are important for each class +label_attention = result["label_attention_attributions"] +# Shape: (batch_size, n_head, num_classes, seq_len) +``` + +### Output Dictionary + +```python +{ + "prediction": tensor, # top-k class predictions + "confidence": tensor, # confidence scores + "captum_attributions": None, + "label_attention_attributions": tensor, # (batch_size, n_head, num_classes, seq_len) + "offset_mapping": list, + "word_ids": list, +} +``` + +### Inspect Per-Class Token Importance + +```python +import torch + +# Average across heads for readability +# label_attention: (batch_size, n_head, num_classes, seq_len) +per_class_scores = label_attention[0].mean(dim=0) # (num_classes, seq_len) + +tokens = tokenizer.tokenize([text]).input_ids[0] +class_names = ["World", "Sports", "Business", "Sci/Tech"] # example + +print("Token importance by class:") +for class_idx, class_name in 
enumerate(class_names): + scores = per_class_scores[class_idx] + top_token_idx = scores.argmax().item() + print(f" [{class_name}] most attended token index: {top_token_idx} " + f"(score: {scores[top_token_idx]:.4f})") +``` + +### Both Methods Together + +You can combine both explainability methods in a single `predict` call: + +```python +result = classifier.predict( + X_test, + explain_with_captum=True, + explain_with_label_attention=True, +) + +captum_attrs = result["captum_attributions"] # gradient-based +label_attrs = result["label_attention_attributions"] # attention-based +``` + +--- + +## Choosing Between Methods + +| | Captum | Label Attention | +|---|---|---| +| **Setup** | Requires `[explainability]` extra | Built into the model | +| **Speed** | Slower (gradient computation) | Fast (forward pass only) | +| **Granularity** | One attribution per token | One per (token, class) pair | +| **Works with any model** | Yes | Requires `n_heads_label_attention` set at training time | +| **Result key** | `captum_attributions` | `label_attention_attributions` | + +**Rule of thumb:** +- Use Captum for a single overall attribution score per token. +- Use label attention when you want to understand how each *class* attends to different + parts of the input (multi-class explainability). 
+
+---
+
+## Debugging with Explainability
+
+### Case 1: Model Ignores Negation
+
+```python
+explain_with_captum(classifier, "This product is not good")
+# If 'not' has low attribution and 'good' is high → model misses negation
+# Solution: add more negation examples to training data
+```
+
+### Case 2: Spurious Correlations
+
+```python
+explain_with_captum(classifier, "Product from Location X is excellent")
+# If the location has high attribution → spurious correlation learned
+# Solution: audit and balance the training set
+```
+
+### Case 3: Low Confidence
+
+```python
+result = classifier.predict(np.array(["Product arrived on time"]),
+                            explain_with_captum=True)
+# Low confidence + low attribution scores = text has no strong class signal
+# This is expected and correct model behavior
+```
+
+---
+
+## Common Issues
+
+### Issue 1: Captum Not Installed
+
+**Error:** `ImportError: Captum is not installed`
+
+**Solution:**
+```bash
+uv sync --extra explainability
+```
+
+### Issue 2: Label Attention Explainability Fails
+
+**Error:** `RuntimeError: Label attention explainability is enabled, but the model was not configured with label attention`
+
+**Solution:** Set `n_heads_label_attention` in `ModelConfig` **before training**:
+```python
+model_config = ModelConfig(
+    embedding_dim=96,
+    num_classes=4,
+    n_heads_label_attention=4,
+)
+```
+You cannot enable label attention on an already-trained model without retraining.
+ +### Issue 3: All Attributions Near Zero + +**Possible causes:** +- Model not well-trained +- Text has no discriminative features for that class + +**Try:** +- Train longer or with more data +- Check prediction confidence first + +--- + +## Summary + +**Key takeaways:** +- Use `explain_with_captum=True` for gradient-based token attributions +- Use `explain_with_label_attention=True` for class-specific attention weights (requires `n_heads_label_attention` set at model init) +- Both methods return `offset_mapping` and `word_ids` for mapping token scores back to words +- Result keys: `captum_attributions` and `label_attention_attributions` + +Ready for multilabel classification? Continue to {doc}`multilabel_classification`! diff --git a/docs/source/tutorials/mixed_features.md b/docs/source/tutorials/mixed_features.md index 163f3d7..6da09a1 100644 --- a/docs/source/tutorials/mixed_features.md +++ b/docs/source/tutorials/mixed_features.md @@ -7,6 +7,7 @@ Learn how to combine text with categorical variables for improved classification By the end of this tutorial, you'll be able to: - Combine text and categorical features in a single model +- Use `ValueEncoder` to handle raw string inputs without manual integer encoding - Configure categorical embeddings - Compare performance with and without categorical features - Understand when categorical features improve results @@ -38,7 +39,11 @@ These features can significantly improve classification when they contain releva - Random or high-cardinality features (e.g., user IDs) - Categorical features with no relationship to labels -## Complete Example +## Complete Example (with ValueEncoder — recommended) + +`ValueEncoder` lets you pass raw string values for both categorical features and labels. +No manual integer encoding is needed before training: the wrapper applies the encoders +automatically and decodes labels back to their original values after prediction. 
```python import numpy as np @@ -47,8 +52,9 @@ from sklearn.model_selection import train_test_split from torchTextClassifiers import ModelConfig, TrainingConfig, torchTextClassifiers from torchTextClassifiers.tokenizers import WordPieceTokenizer +from torchTextClassifiers.value_encoder import DictEncoder, ValueEncoder -# Sample data: Product reviews with category +# Sample data: Product reviews with category (raw string values) texts = [ "Great phone with excellent camera", "Battery dies too quickly", @@ -60,62 +66,122 @@ texts = [ "Product arrived damaged" ] -# Categorical feature: Product category (0=Electronics, 1=Audio) -categories = [0, 0, 0, 0, 1, 1, 0, 0] +categories = ["Electronics", "Electronics", "Electronics", "Electronics", + "Audio", "Audio", "Electronics", "Electronics"] +labels = np.array(["positive", "negative", "positive", "negative", + "positive", "negative", "positive", "negative"]) + +# Combine text and raw categorical into one array +X = np.column_stack([texts, categories]) + +X_train, X_test, y_train, y_test = train_test_split(X, labels, test_size=0.25, random_state=42) + +# --- Build encoders (fit on train data only) --- +cat_encoder = LabelEncoder().fit(X_train[:, 1]) # for the category column +label_encoder = LabelEncoder().fit(y_train) # for labels + +value_encoder = ValueEncoder( + label_encoder=label_encoder, + categorical_encoders={"category": cat_encoder}, # one entry per categorical column +) + +# Create tokenizer +tokenizer = WordPieceTokenizer(vocab_size=1000) +tokenizer.train(X_train[:, 0].tolist()) + +# The ValueEncoder exposes vocabulary sizes and num_classes automatically +model_config = ModelConfig( + embedding_dim=64, + categorical_embedding_dims=[8], + # num_classes and categorical_vocabulary_sizes are inferred from the ValueEncoder +) + +classifier = torchTextClassifiers( + tokenizer=tokenizer, + model_config=model_config, + value_encoder=value_encoder, # <-- pass the encoder here +) + +training_config = TrainingConfig( + 
num_epochs=20, + batch_size=4, + lr=1e-3, + # raw_categorical_inputs=True (default) — the wrapper encodes for you + # raw_labels=True (default) — labels are encoded automatically +) + +classifier.train(X_train, y_train, training_config=training_config) + +# Predict — predictions are decoded back to original label strings +result = classifier.predict(X_test) +print(result["prediction"]) # e.g. ["positive", "negative", ...] +``` + +## Complete Example (manual encoding) + +If you prefer to handle integer encoding yourself, omit the `ValueEncoder` and pass +already-encoded arrays. In this case you must set `raw_categorical_inputs=False` and +`raw_labels=False` in `TrainingConfig` and in `predict`. + +```python +import numpy as np +from sklearn.preprocessing import LabelEncoder +from sklearn.model_selection import train_test_split + +from torchTextClassifiers import ModelConfig, TrainingConfig, torchTextClassifiers +from torchTextClassifiers.tokenizers import WordPieceTokenizer + +texts = [ + "Great phone with excellent camera", + "Battery dies too quickly", + "Love this laptop's performance", + "Screen quality is poor", + "Best headphones I've ever owned", + "Sound quality is disappointing", + "Fast shipping and great quality", + "Product arrived damaged" +] -# Labels: Positive (1) or Negative (0) +# Categorical feature already encoded as integers (0=Electronics, 1=Audio) +categories = [0, 0, 0, 0, 1, 1, 0, 0] labels = [1, 0, 1, 0, 1, 0, 1, 0] -# Prepare data X_text = np.array(texts) -X_categorical = np.array(categories).reshape(-1, 1) # Shape: (n_samples, 1) +X_categorical = np.array(categories).reshape(-1, 1) y = np.array(labels) -# Split data X_text_train, X_text_test, X_cat_train, X_cat_test, y_train, y_test = train_test_split( X_text, X_categorical, y, test_size=0.25, random_state=42 ) -# Create tokenizer tokenizer = WordPieceTokenizer(vocab_size=1000) tokenizer.train(X_text_train.tolist()) -# Configure model WITH categorical features model_config = ModelConfig( 
embedding_dim=64, num_classes=2, - categorical_vocabulary_sizes=[2], # 2 categories (Electronics, Audio) - categorical_embedding_dims=[8], # Embed each category into 8 dimensions + categorical_vocabulary_sizes=[2], + categorical_embedding_dims=[8], ) -# Create classifier -classifier = torchTextClassifiers( - tokenizer=tokenizer, - model_config=model_config -) +classifier = torchTextClassifiers(tokenizer=tokenizer, model_config=model_config) -# Training configuration training_config = TrainingConfig( num_epochs=20, batch_size=4, - lr=1e-3 + lr=1e-3, + raw_categorical_inputs=False, # inputs are already integer-encoded + raw_labels=False, # labels are already integer-encoded ) -# Combine text and categorical features X_train_mixed = np.column_stack([X_text_train, X_cat_train]) X_test_mixed = np.column_stack([X_text_test, X_cat_test]) -# Train model -classifier.train( - X_train_mixed, y_train, - training_config=training_config -) +classifier.train(X_train_mixed, y_train, training_config=training_config) -# Predict -result = classifier.predict(X_test_mixed) +result = classifier.predict(X_test_mixed, raw_categorical_inputs=False) predictions = result["prediction"].squeeze().numpy() -# Evaluate accuracy = (predictions == y_test).mean() print(f"Test Accuracy: {accuracy:.3f}") ``` @@ -124,12 +190,41 @@ print(f"Test Accuracy: {accuracy:.3f}") ### 1. Prepare Categorical Features -Categorical features must be **encoded as integers** (0, 1, 2, ...): +#### With ValueEncoder (recommended) + +`ValueEncoder` handles raw string inputs directly. 
+Build one encoder per categorical column (fit on training data only), then wrap them: + +```python +from sklearn.preprocessing import LabelEncoder +from torchTextClassifiers.value_encoder import DictEncoder, ValueEncoder + +# Example: one string column +cat_encoder = LabelEncoder().fit(X_train_categories) + +# Example: explicit mapping with DictEncoder +cat_encoder = DictEncoder({"Electronics": 0, "Audio": 1}) + +value_encoder = ValueEncoder( + label_encoder=LabelEncoder().fit(y_train), + categorical_encoders={"category": cat_encoder}, # key = feature name (any string) +) + +# Stack text + raw string categories — no integer conversion needed +X_train = np.column_stack([texts_train, categories_train]) # dtype=object is fine +``` + +The `ValueEncoder` also exposes `.vocabulary_sizes` and `.num_classes` so you don't +have to compute them manually for `ModelConfig`. + +#### Without ValueEncoder (manual encoding) + +If you prefer to manage encoding yourself, categorical features must be +**encoded as integers** (0, 1, 2, ...) 
before being passed to the model: ```python from sklearn.preprocessing import LabelEncoder -# Example: Encode product categories categories = ["Electronics", "Audio", "Electronics", "Audio"] encoder = LabelEncoder() categories_encoded = encoder.fit_transform(categories) @@ -353,12 +448,25 @@ X_categorical = categories.reshape(-1, 1) # Add column dimension ### Issue 2: Non-Integer Categories -**Error:** "Expected integer values" +**Error:** "Expected integer values" or "Raw categorical input encoding is enabled, but no value_encoder was provided" + +**Solution (recommended):** Use a `ValueEncoder` so the wrapper handles encoding automatically: +```python +from torchTextClassifiers.value_encoder import ValueEncoder +from sklearn.preprocessing import LabelEncoder + +value_encoder = ValueEncoder( + label_encoder=LabelEncoder().fit(y_train), + categorical_encoders={"category": LabelEncoder().fit(X_train_categories)}, +) +classifier = torchTextClassifiers(..., value_encoder=value_encoder) +``` -**Solution:** Use `LabelEncoder`: +**Alternative:** encode manually and set `raw_categorical_inputs=False`: ```python encoder = LabelEncoder() categories_encoded = encoder.fit_transform(categories) +training_config = TrainingConfig(..., raw_categorical_inputs=False, raw_labels=False) ``` ### Issue 3: Missing Vocabulary Sizes diff --git a/torchTextClassifiers/test copy.py b/torchTextClassifiers/test copy.py new file mode 100644 index 0000000..09dc5e6 --- /dev/null +++ b/torchTextClassifiers/test copy.py @@ -0,0 +1,107 @@ +import numpy as np +import torch + +from torchTextClassifiers import ModelConfig, TrainingConfig, torchTextClassifiers +from torchTextClassifiers.tokenizers import HuggingFaceTokenizer + +# Note: %load_ext autoreload and %autoreload 2 are specific to IPython/Notebooks +# and are omitted here for a standard Python script. + +# ========================================== +# 1. 
Ragged-lists approach +# ========================================== + +# In multilabel classification, each instance can be assigned multiple labels simultaneously. +# Let's use fake data where labels is a list of lists (ragged array). +sample_text_data = [ + "This is a positive example", + "This is a negative example", + "Another positive case", + "Another negative case", + "Good example here", + "Bad example here", +] + +# Each inner list contains labels for the corresponding instance +labels_ragged = [[0, 1, 5], [0, 4], [1, 5], [0, 1, 4], [1, 5], [0]] + +# Note: labels_ragged is a "jagged array." +# np.array(labels_ragged) would not work directly as a standard numeric matrix. +# However, torchTextClassifiers handles this directly. + +# Load a pre-trained tokenizer +tokenizer = HuggingFaceTokenizer.load_from_pretrained( + "google-bert/bert-base-uncased", output_dim=126 +) + +X = np.array(sample_text_data) +Y_ragged = labels_ragged + +# Configure the model and training +# We use BCEWithLogitsLoss for multilabel tasks to treat each label +# as a separate binary classification problem. +embedding_dim = 96 +num_classes = max(max(label_list) for label_list in labels_ragged) + 1 + +model_config = ModelConfig( + embedding_dim=embedding_dim, + num_classes=num_classes, +) + +training_config = TrainingConfig( + lr=1e-3, + batch_size=4, + num_epochs=1, + loss=torch.nn.BCEWithLogitsLoss(), # Essential for multilabel + raw_labels=False, + raw_categorical_inputs=False, +) + +# Initialize the classifier with ragged_multilabel=True +ttc_ragged = torchTextClassifiers( + tokenizer=tokenizer, + model_config=model_config, + ragged_multilabel=True, # Key for ragged list input! +) + +print("Starting training with ragged labels...") +ttc_ragged.train( + X_train=X, + y_train=Y_ragged, + training_config=training_config, +) + +# Behind the scenes, the ragged lists are converted into a binary matrix (one-hot version). + +# ========================================== +# 2. 
One-hot / multidimensional output approach
+# ==========================================
+
+# You can also provide a one-hot/multidimensional array (or float probabilities).
+# Here, each row is a vector of size equal to the number of labels (6, matching num_classes above).
+labels_one_hot = [
+    [1.0, 1.0, 0.0, 0.0, 0.0, 1.0],
+    [1.0, 0.0, 0.0, 0.0, 1.0, 0.0],
+    [0.0, 1.0, 0.0, 0.0, 0.0, 1.0],
+    [1.0, 1.0, 0.0, 0.0, 1.0, 0.0],
+    [0.0, 1.0, 0.0, 0.0, 0.0, 1.0],
+    [1.0, 0.0, 0.0, 0.0, 1.0, 0.0],
+]
+Y_one_hot = np.array(labels_one_hot)
+
+# When using one-hot/dense arrays, set ragged_multilabel=False (default); training_config from section 1 (BCEWithLogitsLoss) is reused
+ttc_dense = torchTextClassifiers(
+    tokenizer=tokenizer,
+    model_config=model_config,
+)
+
+print("\nStarting training with one-hot labels...")
+ttc_dense.train(
+    X_train=X,
+    y_train=Y_one_hot,
+    training_config=training_config,
+)
+
+# Final Note:
+# - Use BCEWithLogitsLoss for multilabel settings.
+# - Use CrossEntropyLoss for "soft" multiclass (where probabilities sum to 1).
diff --git a/torchTextClassifiers/test.py b/torchTextClassifiers/test.py
new file mode 100644
index 0000000..2758b28
--- /dev/null
+++ b/torchTextClassifiers/test.py
@@ -0,0 +1,95 @@
+import numpy as np
+import pandas as pd
+from sklearn.preprocessing import LabelEncoder
+
+from torchTextClassifiers import ModelConfig, TrainingConfig, torchTextClassifiers
+from torchTextClassifiers.tokenizers import WordPieceTokenizer
+from torchTextClassifiers.value_encoder import DictEncoder, ValueEncoder
+
+sample_text_data = [
+    "This is a positive example",
+    "This is a negative example",
+    "Another positive case",
+    "Another negative case",
+    "Good example here",
+    "Bad example here",
+]
+
+# Two string-valued categorical features per instance (two unique values each)
+categorical_data = np.array(
+    [
+        ["cat", "red"],
+        ["dog", "blue"],
+        ["cat", "red"],
+        ["dog", "blue"],
+        ["cat", "red"],
+        ["dog", "blue"],
+    ]
+)
+
+labels = np.array(["positive", "negative", "positive", "negative", "positive", "negative"])
+
+
+df = pd.DataFrame(
+    {
+        "text": sample_text_data,
+        "category": categorical_data[:, 0],
"color": categorical_data[:, 1],
+        "label": labels,
+    }
+)
+# Train a small WordPiece tokenizer on the demo corpus.
+vocab_size = 10
+tokenizer = WordPieceTokenizer(vocab_size, output_dim=50)
+tokenizer.train(sample_text_data)
+
+# One encoder per categorical feature; both our DictEncoder and sklearn's LabelEncoder are accepted.
+encoders = {}
+# category : DictEncoder (ours)
+feature = "category"
+mapping = {val: idx for idx, val in enumerate(df[feature].unique())}
+encoders[feature] = DictEncoder(mapping)
+
+# color: LabelEncoder (sklearn)
+le = LabelEncoder()
+le.fit(df["color"])
+encoders["color"] = le
+
+# Label encoder: fit on the string labels so the wrapper maps them to ints.
+feature = "label"
+le_label = LabelEncoder()
+le_label.fit(df[feature])
+label_encoder = le_label
+
+# OR you can also use DictEncoder
+# dict_mapping = {val: idx for idx, val in enumerate(df[feature].unique())}
+# label_encoder = DictEncoder(dict_mapping)
+
+value_encoder = ValueEncoder(label_encoder, encoders)
+
+
+model_config = ModelConfig(
+    embedding_dim=10,
+    categorical_embedding_dims=[5, 5],
+)
+training_config = TrainingConfig(
+    num_epochs=1,
+    batch_size=2,
+    lr=1e-3,
+)
+
+# The value_encoder lets train/predict accept raw string categoricals and labels.
+ttc = torchTextClassifiers(
+    tokenizer=tokenizer,
+    model_config=model_config,
+    value_encoder=value_encoder,
+)
+
+ttc.train(
+    X_train=df[["text", "category", "color"]].values,
+    y_train=df["label"].values,
+    training_config=training_config,
+)
+
+# ttc = torchTextClassifiers.load("my_ttc/")  # NOTE(review): disabled — original call discarded the return value and nothing was ever saved to "my_ttc/"; save first, then bind the loaded model before using it
+
+ttc.predict(
+    X_test=df[["text", "category", "color"]].values,
+    raw_categorical_inputs=True,  # raw string values — the fitted ValueEncoder performs the encoding
+    top_k=2,
+)