@@ -29,7 +29,7 @@ class ForwardFeatureSelection:
2929 more or less with the maximum number of steps in the forward feature
3030 selection.
3131 pos_only : bool
32- Whether or not the model coefficients should all be positive.
32+ Whether or not the model coefficients should all be positive (no sign flips).
3333 self._fitted_models : list
3434 List of fitted models.
3535 """
@@ -76,8 +76,7 @@ def get_model_from_step(self, step: int):
7676
7777 def compute_model_performances (self , data : pd .DataFrame ,
7878 target_column_name : str ,
79- splits : list = ["train" , "selection" ,
80- "validation" ]
79+ splits : list = ["train" , "selection" , "validation" ]
8180 ) -> pd .DataFrame :
8281 """Compute for each model the performance for different sets (e.g.
8382 train-selection-validation) and return them along with a list of
@@ -111,7 +110,7 @@ def compute_model_performances(self, data: pd.DataFrame,
111110 "last_added_predictor" : list (last_added_predictor )[0 ]
112111 }
113112
114- # Evaluate model on each data set split,
113+ # Evaluate model on each dataset split,
115114 # e.g. train-selection-validation
116115 tmp .update ({
117116 f"{ split } _performance" : model .evaluate (
@@ -139,7 +138,11 @@ def fit(self, train_data: pd.DataFrame, target_column_name: str,
139138 Parameters
140139 ----------
141140 train_data : pd.DataFrame
142- Data on which to fit the model.
141+ Data on which to fit the model. Must contain a "split" column
142+ with both "train" and "selection" values for correct model
143+ selection: the "train" split is used to train each model, and the
144+ "selection" split is used to evaluate which model to include in
145+ the actual forward feature selection.
143146 target_column_name : str
144147 Name of the target column.
145148 predictors : list
@@ -155,6 +158,12 @@ def fit(self, train_data: pd.DataFrame, target_column_name: str,
155158 In case the number of forced predictors is larger than the maximum
156159 number of allowed predictors in the model.
157160 """
161+
162+ assert "split" in train_data .columns , "The train_data input df does not include a split column."
163+ print (train_data ["split" ].unique ())
164+ assert len (set (["train" , "selection" ]).difference (set (train_data ["split" ].unique ()))) == 0 , \
165+ "The train_data input df does not include a 'train' and 'selection' split."
166+
158167 # remove excluded predictors from predictor lists
159168 filtered_predictors = [var for var in predictors
160169 if (var not in excluded_predictors and
@@ -163,13 +172,13 @@ def fit(self, train_data: pd.DataFrame, target_column_name: str,
163172 # checks on predictor lists and self.max_predictors attr
164173 if len (forced_predictors ) > self .max_predictors :
165174 raise ValueError ("Size of forced_predictors cannot be bigger than "
166- "max_predictors" )
175+ "max_predictors. " )
167176 elif len (forced_predictors ) == self .max_predictors :
168177 log .info ("Size of forced_predictors equals max_predictors "
169178 "only one model will be trained..." )
170179 # train model with all forced_predictors (only)
171180 (self ._fitted_models
172- .append (self ._train_model (train_data ,
181+ .append (self ._train_model (train_data [ train_data [ "split" ] == "train" ] ,
173182 target_column_name ,
174183 forced_predictors )))
175184 else :
@@ -178,12 +187,14 @@ def fit(self, train_data: pd.DataFrame, target_column_name: str,
178187 filtered_predictors ,
179188 forced_predictors )
180189
181- def _forward_selection (self , train_data : pd .DataFrame ,
182- target_column_name : str , predictors : list ,
190+ def _forward_selection (self ,
191+ train_data : pd .DataFrame ,
192+ target_column_name : str ,
193+ predictors : list ,
183194 forced_predictors : list = []) -> list :
184195 """Perform the forward feature selection algorithm to compute a list
185196 of models (with increasing performance). The length of the list,
186- i.e. the number of models is bounded by the max_predictors class
197+ i.e. the number of models, is bounded by the max_predictors class
187198 attribute.
188199
189200 Parameters
@@ -208,10 +219,11 @@ def _forward_selection(self, train_data: pd.DataFrame,
208219
209220 max_steps = 1 + min (self .max_predictors ,
210221 len (predictors ) + len (forced_predictors ))
222+
211223 for step in tqdm (range (1 , max_steps ), desc = "Sequentially adding best "
212224 "predictor..." ):
213225 if step <= len (forced_predictors ):
214- # first, we go through forced predictors
226+ # first, we go through the forced predictors
215227 candidate_predictors = [var for var in forced_predictors
216228 if var not in current_predictors ]
217229 else :
@@ -230,13 +242,19 @@ def _forward_selection(self, train_data: pd.DataFrame,
230242 .union (set (model .predictors )))
231243
232244 fitted_models .append (model )
245+ # else:
246+ # # If the model is None for the first time,
247+ # # one could in theory stop the feature selection process here,
248+ # # but we let it keep running so that tqdm finishes cleanly
249+ # break
233250
234251 if not fitted_models :
235- log .error ("No models found in forward selection" )
252+ log .error ("No models found in forward selection. " )
236253
237254 return fitted_models
238255
239- def _find_next_best_model (self , train_data : pd .DataFrame ,
256+ def _find_next_best_model (self ,
257+ train_data : pd .DataFrame ,
240258 target_column_name : str ,
241259 candidate_predictors : list ,
242260 current_predictors : list ):
@@ -272,15 +290,19 @@ def _find_next_best_model(self, train_data: pd.DataFrame,
272290 "for the given model_type specified as "
273291 "ForwardFeatureSelection argument." )
274292
293+ fit_data = train_data [train_data ["split" ] == "train" ] # data to fit the models with
294+ sel_data = train_data [train_data ["split" ] == "selection" ] # data to compare the models with
295+
275296 for pred in candidate_predictors :
276297 # Train a model with an additional predictor
277- model = self ._train_model (train_data , target_column_name ,
298+ model = self ._train_model (fit_data , target_column_name ,
278299 (current_predictors + [pred ]))
300+
279301 # Evaluate the model
280302 performance = (model
281- .evaluate (train_data [current_predictors + [pred ]],
282- train_data [target_column_name ],
283- split = "train " ))
303+ .evaluate (sel_data [current_predictors + [pred ]],
304+ sel_data [target_column_name ],
305+ split = "selection " ))
284306
285307 if self .pos_only and (not (model .get_coef () >= 0 ).all ()):
286308 continue
0 commit comments