Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions supervised/automl.py
Original file line number Diff line number Diff line change
Expand Up @@ -444,7 +444,7 @@ def fit(
pass


def predict(self, X: Union[List, numpy.ndarray, pandas.DataFrame]) -> numpy.ndarray:
def predict(self, X: Union[List, numpy.ndarray, pandas.DataFrame], models = []) -> numpy.ndarray:
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think a comment describing how the function works should be here. The docs are generated from this comment: https://supervised.mljar.com/api/#supervised.automl.AutoML.predict

"""
Computes predictions from AutoML best model.

Expand All @@ -461,7 +461,7 @@ def predict(self, X: Union[List, numpy.ndarray, pandas.DataFrame]) -> numpy.ndar
Raises:
AutoMLException: Model has not yet been fitted.
"""
return self._predict(X)
return self._predict(X, models)

def predict_proba(
self, X: Union[List, numpy.ndarray, pandas.DataFrame]
Expand Down
118 changes: 108 additions & 10 deletions supervised/base_automl.py
Original file line number Diff line number Diff line change
Expand Up @@ -1209,6 +1209,9 @@ def _fit(self, X, y, sample_weight=None, cv=None, sensitive_features=None):

if not self._models:
raise AutoMLException("No models produced.")

# Sort the models by their final loss, lowest first.
self._models = sorted(self._models, key=lambda x: x.get_final_loss())
self._fit_level = "finished"
self.save_progress()
self.select_and_save_best(show_warnings=True)
Expand Down Expand Up @@ -1434,6 +1437,23 @@ def models_needed_on_predict(self, required_model_name):
+ [required_model_name]
)
)

def _do_prediction_union(self, X, model_list=None):
    """
    Predict with several models and join the results column-wise.

    Every model in ``model_list`` is passed, together with ``X``, to
    ``self._base_predict``. The result for the first model keeps its
    column names; the result for the model at position ``i`` (``i >= 1``)
    gets ``_{i}`` appended to each column name so that the columns of
    different models can be told apart after concatenation.

    Parameters
    ----------
    X : array-like, pandas.DataFrame
        Input data, forwarded unchanged to ``self._base_predict``.

    model_list : iterable, default=None
        Models to predict with, in output order.

    Returns
    -------
    pandas.DataFrame
        The per-model results concatenated along the column axis.
        (Assumes ``self._base_predict`` returns a DataFrame - the code
        relies on ``add_suffix`` and ``pd.concat``.)

    Raises
    ------
    ValueError
        If ``model_list`` is ``None`` or empty.
    """
    # A ``None`` default replaces the former shared mutable ``[]`` default.
    models = [] if model_list is None else list(model_list)
    if not models:
        # pd.concat would raise ValueError("No objects to concatenate")
        # here; keep the exception type but say what is actually wrong.
        raise ValueError("model_list must contain at least one model.")

    frames = []
    for position, model in enumerate(models):
        frame = self._base_predict(X, model)
        if position > 0:
            # Only later models are renamed; the first keeps plain names.
            frame = frame.add_suffix(f"_{position}")
        frames.append(frame)

    return pd.concat(frames, axis=1)



def _base_predict(self, X, model=None):
if model is None:
Expand Down Expand Up @@ -1504,16 +1524,94 @@ def _base_predict(self, X, model=None):
else:
return predictions

def _predict(self, X):
    """
    Return the predictions for ``X`` as a numpy array.

    The values are read from the frame returned by
    ``self._base_predict(X)``: the "prediction" column when the task is
    REGRESSION, the "label" column for every other task.
    """
    frame = self._base_predict(X)

    if self._ml_task != REGRESSION:
        column = "label"
    else:
        column = "prediction"

    return frame[column].to_numpy()
def _predict(self, X, models=None):
    """
    Generate predictions with the best model or with explicitly named models.

    Parameters
    ----------
    X : array-like, pandas.DataFrame
        Input data to generate predictions for.

    models : list of str or str, default=None
        Names of the models to use. A single name may be passed as a plain
        string. ``None`` or an empty list selects ``self._best_model``.
        Repeating a name has no effect: every matching model is used once.
        The selected models are used in the order in which they appear in
        ``self._models``, not in the order of ``models``.

    Returns
    -------
    numpy.ndarray
        - One model selected: 1D array taken from the "label" column
          (task is not REGRESSION) or the "prediction" column (REGRESSION).
        - Several models selected: 2D array built from every column of the
          combined predictions whose name starts with "label"
          (task is not REGRESSION) or "prediction" (REGRESSION); intended
          shape is (n_samples x n_models).

    Raises
    ------
    AutoMLException
        If any requested name does not match a model in ``self._models``.
    """
    # TODO(review): follow-ups requested in code review for this change:
    #   - tests: no models given (the best model is used); models given
    #     (one prediction per model); both cases again after a trained
    #     AutoML has been loaded back from disk.
    #   - offer the same `models` option in predict_proba and predict_all;
    #     predict_all may simply call this implementation.

    # Normalise the argument. ``None`` replaces the former shared mutable
    # ``[]`` default, and a bare string is treated as one name instead of
    # being iterated character by character.
    if models is None:
        requested = []
    elif isinstance(models, str):
        requested = [models]
    else:
        requested = list(models)

    if requested:
        # Names of all models currently known to this instance.
        available = [m.get_name() for m in self._models]

        # Names passed by the caller that do not exist.
        invalid = [name for name in requested if name not in available]
        if invalid:
            raise AutoMLException(
                f"The following custom models are not available: {invalid}\n"
                f"Available models are: {available}"
            )

        selected_models = [m for m in self._models if m.get_name() in requested]
    else:
        selected_models = [self._best_model]

    # self._best_model may still be None at this point (TODO confirm: e.g.
    # before the instance has been fitted or loaded). Skip the message in
    # that case so the call reaches _base_predict, which has its own
    # ``model is None`` handling, instead of failing with AttributeError.
    # NOTE(review): consider replacing print with the module's logger.
    if all(model is not None for model in selected_models):
        print(f'Models being used for prediction: {[model.get_name() for model in selected_models]}')

    is_regression = not (self._ml_task != REGRESSION)

    # ------------------------------------------------------------------
    # SINGLE-MODEL PREDICTION (returns 1D array)
    # ------------------------------------------------------------------
    if len(selected_models) == 1:
        predictions = self._base_predict(X, selected_models[0])
        column = "prediction" if is_regression else "label"
        return predictions[column].to_numpy()

    # ------------------------------------------------------------------
    # MULTI-MODEL PREDICTION (returns 2D array)
    # ------------------------------------------------------------------
    predictions = self._do_prediction_union(X, selected_models)

    # Regression results live in "prediction*" columns, all other tasks
    # in "label*" columns (one per model after suffixing).
    prefix = "prediction" if is_regression else "label"
    cols = [c for c in predictions.columns if c.startswith(prefix)]

    return predictions[cols].to_numpy()



def _predict_proba(self, X):
# Check is task type is correct
Expand Down