Skip to content

Commit 08dc4e3

Browse files
feat(value_encoder)!: stabilize first implementation of value_encoder
- take a step back, add label encoder vs categorical encoder - adapted checkers at training time - train is working, need to adapt predict - saving and loading is working - incidentally solving #77
1 parent 4cb8632 commit 08dc4e3

3 files changed

Lines changed: 160 additions & 77 deletions

File tree

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,2 +1,2 @@
1-
from .categorical_value_encoder import CategoricalValueEncoder as CategoricalValueEncoder
21
from .categorical_value_encoder import DictEncoder as DictEncoder
2+
from .categorical_value_encoder import ValueEncoder as ValueEncoder

torchTextClassifiers/categorical_value_encoder/categorical_value_encoder.py

Lines changed: 74 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,5 @@
1+
from typing import Optional
2+
13
import numpy as np
24
from sklearn.preprocessing import LabelEncoder
35

@@ -25,44 +27,71 @@ def _dict_map(dic, col):
2527
else:
2628
raise TypeError("Unsupported type for encoding: {}".format(type(col)))
2729

30+
@property
31+
def vocabulary_size(self):
32+
return len(self.mapping)
33+
2834

29-
class CategoricalValueEncoder:
35+
class ValueEncoder:
3036
"""
3137
An object to encode raw categorical values into numerical indices.
3238
33-
Initialized with pre-built DictEncoder or sklearn LabelEncoder instances,
34-
one per categorical feature.
35-
3639
Build encoders externally before passing them in:
3740
- DictEncoder: provide a ``{value: index}`` mapping directly.
3841
- sklearn LabelEncoder: call ``LabelEncoder().fit(column)`` per feature.
3942
4043
Initialization:
41-
- encoders: A dictionary mapping feature names to DictEncoder or LabelEncoder instances.
44+
- label_encoder: A DictEncoder or LabelEncoder instance for encoding labels.
45+
- categorical_encoders (optional): A dictionary mapping feature names to DictEncoder or LabelEncoder instances.
4246
4347
Properties:
4448
- vocabulary_sizes: List of vocabulary sizes (number of unique values) for each feature.
49+
- num_classes: Number of unique classes in the label encoder.
4550
4651
Usage:
4752
- transform(array): Encode a 2D array of shape (N, n_features) to integers.
4853
- __call__(array): Alias for transform.
4954
"""
5055

51-
def __init__(self, encoders: dict[str, DictEncoder | LabelEncoder]):
52-
self.encoders = encoders
56+
def __init__(
57+
self,
58+
label_encoder: DictEncoder | LabelEncoder,
59+
categorical_encoders: Optional[dict[str, DictEncoder | LabelEncoder]] = None,
60+
):
61+
self.categorical_encoders = categorical_encoders
62+
63+
if not isinstance(label_encoder, (DictEncoder, LabelEncoder)):
64+
raise TypeError(
65+
f"label_encoder must be a DictEncoder or LabelEncoder instance, got {type(label_encoder)}"
66+
)
67+
self.label_encoder = label_encoder
5368

5469
@property
5570
def vocabulary_sizes(self) -> list[int]:
5671
"""Number of unique categories per feature, in order."""
57-
sizes = []
58-
for enc in self.encoders.values():
59-
if isinstance(enc, DictEncoder):
60-
sizes.append(len(enc.mapping))
61-
elif hasattr(enc, "classes_"):
62-
sizes.append(len(enc.classes_))
63-
else:
64-
raise TypeError(f"Unsupported encoder type: {type(enc)}")
65-
return sizes
72+
73+
if self.categorical_encoders is None:
74+
return None
75+
else:
76+
sizes = []
77+
for enc in self.categorical_encoders.values():
78+
if isinstance(enc, DictEncoder):
79+
sizes.append(len(enc.mapping))
80+
elif hasattr(enc, "classes_"):
81+
sizes.append(len(enc.classes_))
82+
else:
83+
raise TypeError(f"Unsupported encoder type: {type(enc)}")
84+
return sizes
85+
86+
@property
87+
def num_classes(self) -> int:
88+
"""Number of unique classes in the label encoder."""
89+
if isinstance(self.label_encoder, DictEncoder):
90+
return len(self.label_encoder.mapping)
91+
elif hasattr(self.label_encoder, "classes_"):
92+
return len(self.label_encoder.classes_)
93+
else:
94+
raise TypeError(f"Unsupported label encoder type: {type(self.label_encoder)}")
6695

6796
def transform(self, X_categorical: np.ndarray) -> np.ndarray:
6897
"""Encode all categorical columns to integer indices.
@@ -78,11 +107,15 @@ def transform(self, X_categorical: np.ndarray) -> np.ndarray:
78107
Raises:
79108
ValueError: If any value was not seen during fitting.
80109
"""
110+
111+
if self.categorical_encoders is None:
112+
raise ValueError("No categorical encoders provided. Cannot transform data.")
113+
81114
if X_categorical.ndim == 1:
82115
X_categorical = X_categorical.reshape(-1, 1)
83116

84117
result = np.empty(X_categorical.shape, dtype=np.int64)
85-
for idx, (name, encoder) in enumerate(self.encoders.items()):
118+
for idx, (name, encoder) in enumerate(self.categorical_encoders.items()):
86119
col = X_categorical[:, idx].astype(str)
87120
encoded = encoder.transform(col)
88121
try:
@@ -96,5 +129,29 @@ def transform(self, X_categorical: np.ndarray) -> np.ndarray:
96129

97130
return result
98131

132+
def transform_labels(self, y_labels: np.ndarray) -> np.ndarray:
133+
"""Encode label array to integer indices.
134+
135+
Values are converted to strings before lookup. Unknown values raise a ValueError.
136+
137+
Args:
138+
y_labels: Array of shape (N,) with label values.
139+
Returns:
140+
Integer-encoded array of shape (N,), dtype int64.
141+
Raises:
142+
ValueError: If any label value was not seen during fitting.
143+
"""
144+
145+
col = y_labels.astype(str)
146+
encoded = self.label_encoder.transform(col)
147+
try:
148+
return encoded.astype(np.int64)
149+
except (TypeError, ValueError):
150+
unknown = [v for v, e in zip(col.tolist(), encoded.tolist()) if e is None]
151+
raise ValueError(
152+
f"Unknown values in label encoder: {unknown}. "
153+
"These values were not seen during fitting."
154+
)
155+
99156
def __call__(self, array: np.ndarray) -> np.ndarray:
100157
return self.transform(array)

0 commit comments

Comments
 (0)