1+ from typing import Optional
2+
13import numpy as np
24from sklearn .preprocessing import LabelEncoder
35
@@ -25,44 +27,71 @@ def _dict_map(dic, col):
2527 else :
2628 raise TypeError ("Unsupported type for encoding: {}" .format (type (col )))
2729
30+ @property
31+ def vocabulary_size (self ):
32+ return len (self .mapping )
33+
2834
29- class CategoricalValueEncoder :
35+ class ValueEncoder :
3036 """
3137 An object to encode raw categorical values into numerical indices.
3238
33- Initialized with pre-built DictEncoder or sklearn LabelEncoder instances,
34- one per categorical feature.
35-
3639 Build encoders externally before passing them in:
3740 - DictEncoder: provide a ``{value: index}`` mapping directly.
3841 - sklearn LabelEncoder: call ``LabelEncoder().fit(column)`` per feature.
3942
4043 Initialization:
41- - encoders: A dictionary mapping feature names to DictEncoder or LabelEncoder instances.
44+ - label_encoder: A DictEncoder or LabelEncoder instance for encoding labels.
45+ - categorical_encoders (optional): A dictionary mapping feature names to DictEncoder or LabelEncoder instances.
4246
4347 Properties:
4448 - vocabulary_sizes: List of vocabulary sizes (number of unique values) for each feature, or None if no categorical encoders were provided.
49+ - num_classes: Number of unique classes in the label encoder.
4550
4651 Usage:
4752 - transform(array): Encode a 2D array of shape (N, n_features) to integers.
4853 - __call__(array): Alias for transform.
4954 """
5055
51- def __init__ (self , encoders : dict [str , DictEncoder | LabelEncoder ]):
52- self .encoders = encoders
56+ def __init__ (
57+ self ,
58+ label_encoder : DictEncoder | LabelEncoder ,
59+ categorical_encoders : Optional [dict [str , DictEncoder | LabelEncoder ]] = None ,
60+ ):
61+ self .categorical_encoders = categorical_encoders
62+
63+ if not isinstance (label_encoder , (DictEncoder , LabelEncoder )):
64+ raise TypeError (
65+ f"label_encoder must be a DictEncoder or LabelEncoder instance, got { type (label_encoder )} "
66+ )
67+ self .label_encoder = label_encoder
5368
5469 @property
5570 def vocabulary_sizes (self ) -> list [int ]:
5671 """Number of unique categories per feature, in order."""
57- sizes = []
58- for enc in self .encoders .values ():
59- if isinstance (enc , DictEncoder ):
60- sizes .append (len (enc .mapping ))
61- elif hasattr (enc , "classes_" ):
62- sizes .append (len (enc .classes_ ))
63- else :
64- raise TypeError (f"Unsupported encoder type: { type (enc )} " )
65- return sizes
72+
73+ if self .categorical_encoders is None :
74+ return None
75+ else :
76+ sizes = []
77+ for enc in self .categorical_encoders .values ():
78+ if isinstance (enc , DictEncoder ):
79+ sizes .append (len (enc .mapping ))
80+ elif hasattr (enc , "classes_" ):
81+ sizes .append (len (enc .classes_ ))
82+ else :
83+ raise TypeError (f"Unsupported encoder type: { type (enc )} " )
84+ return sizes
85+
86+ @property
87+ def num_classes (self ) -> int :
88+ """Number of unique classes in the label encoder, if provided."""
89+ if isinstance (self .label_encoder , DictEncoder ):
90+ return len (self .label_encoder .mapping )
91+ elif hasattr (self .label_encoder , "classes_" ):
92+ return len (self .label_encoder .classes_ )
93+ else :
94+ raise TypeError (f"Unsupported label encoder type: { type (self .label_encoder )} " )
6695
6796 def transform (self , X_categorical : np .ndarray ) -> np .ndarray :
6897 """Encode all categorical columns to integer indices.
@@ -78,11 +107,15 @@ def transform(self, X_categorical: np.ndarray) -> np.ndarray:
78107 Raises:
79108 ValueError: If any value was not seen during fitting.
80109 """
110+
111+ if self .categorical_encoders is None :
112+ raise ValueError ("No categorical encoders provided. Cannot transform data." )
113+
81114 if X_categorical .ndim == 1 :
82115 X_categorical = X_categorical .reshape (- 1 , 1 )
83116
84117 result = np .empty (X_categorical .shape , dtype = np .int64 )
85- for idx , (name , encoder ) in enumerate (self .encoders .items ()):
118+ for idx , (name , encoder ) in enumerate (self .categorical_encoders .items ()):
86119 col = X_categorical [:, idx ].astype (str )
87120 encoded = encoder .transform (col )
88121 try :
@@ -96,5 +129,29 @@ def transform(self, X_categorical: np.ndarray) -> np.ndarray:
96129
97130 return result
98131
132+ def transform_labels (self , y_labels : np .ndarray ) -> np .ndarray :
133+ """Encode label array to integer indices.
134+
135+ Values are converted to strings before lookup. Unknown values raise a ValueError.
136+
137+ Args:
138+ y_labels: Array of shape (N,) with label values.
139+ Returns:
140+ Integer-encoded array of shape (N,), dtype int64.
141+ Raises:
142+ ValueError: If any label value was not seen during fitting.
143+ """
144+
145+ col = y_labels .astype (str )
146+ encoded = self .label_encoder .transform (col )
147+ try :
148+ return encoded .astype (np .int64 )
149+ except (TypeError , ValueError ):
150+ unknown = [v for v , e in zip (col .tolist (), encoded .tolist ()) if e is None ]
151+ raise ValueError (
152+ f"Unknown values in label encoder: { unknown } . "
153+ "These values were not seen during fitting."
154+ )
155+
99156 def __call__ (self , array : np .ndarray ) -> np .ndarray :
100157 return self .transform (array )
0 commit comments