diff --git a/flaml/automl/automl.py b/flaml/automl/automl.py index 1064fd29df..493045022b 100644 --- a/flaml/automl/automl.py +++ b/flaml/automl/automl.py @@ -343,6 +343,12 @@ def custom_metric( } ``` skip_transform: boolean, default=False | Whether to pre-process data prior to modeling. + allow_label_overlap: boolean, default=True | For classification tasks with holdout evaluation, + whether to allow label overlap between train and validation sets. When True (default), + uses a fast strategy that adds the first instance of missing labels to the set that is + missing them, which may create some overlap. When False, uses a precise but slower + strategy that intelligently re-splits instances to avoid overlap when possible. + Only affects classification tasks with holdout evaluation method. fit_kwargs_by_estimator: dict, default=None | The user specified keywords arguments, grouped by estimator name. e.g., @@ -373,6 +379,7 @@ def custom_metric( settings["split_ratio"] = settings.get("split_ratio", SPLIT_RATIO) settings["n_splits"] = settings.get("n_splits", N_SPLITS) settings["auto_augment"] = settings.get("auto_augment", True) + settings["allow_label_overlap"] = settings.get("allow_label_overlap", True) settings["metric"] = settings.get("metric", "auto") # Validate that custom metric is callable if not a string self._validate_metric_parameter(settings["metric"], allow_auto=True) @@ -1113,6 +1120,7 @@ def retrain_from_log( eval_method = self._decide_eval_method(eval_method, time_budget) self.modelcount = 0 self._auto_augment = auto_augment + self._allow_label_overlap = self._settings.get("allow_label_overlap", True) self._prepare_data(eval_method, split_ratio, n_splits) self._state.time_budget = -1 self._state.free_mem_ratio = 0 @@ -1716,6 +1724,7 @@ def _prepare_data(self, eval_method, split_ratio, n_splits): n_splits, self._df, self._sample_weight_full, + self._allow_label_overlap, ) self.data_size_full = self._state.data_size_full @@ -1772,6 +1781,7 @@ def fit( 
time_col=None, cv_score_agg_func=None, skip_transform=None, + allow_label_overlap=None, mlflow_logging=None, fit_kwargs_by_estimator=None, mlflow_exp_name=None, @@ -2058,6 +2068,12 @@ def cv_score_agg_func(val_loss_folds, log_metrics_folds): ``` skip_transform: boolean, default=False | Whether to pre-process data prior to modeling. + allow_label_overlap: boolean, default=True | For classification tasks with holdout evaluation, + whether to allow label overlap between train and validation sets. When True (default), + uses a fast strategy that adds the first instance of missing labels to the set that is + missing them, which may create some overlap. When False, uses a precise but slower + strategy that intelligently re-splits instances to avoid overlap when possible. + Only affects classification tasks with holdout evaluation method. mlflow_logging: boolean, default=None | Whether to log the training results to mlflow. Default value is None, which means the logging decision is made based on AutoML.__init__'s mlflow_logging argument. Not valid if mlflow is not installed. 
@@ -2127,6 +2143,9 @@ def cv_score_agg_func(val_loss_folds, log_metrics_folds): split_ratio = split_ratio or self._settings.get("split_ratio") n_splits = n_splits or self._settings.get("n_splits") auto_augment = self._settings.get("auto_augment") if auto_augment is None else auto_augment + allow_label_overlap = ( + self._settings.get("allow_label_overlap") if allow_label_overlap is None else allow_label_overlap + ) metric = self._settings.get("metric") if metric is None else metric estimator_list = estimator_list or self._settings.get("estimator_list") log_file_name = self._settings.get("log_file_name") if log_file_name is None else log_file_name @@ -2309,6 +2328,7 @@ def cv_score_agg_func(val_loss_folds, log_metrics_folds): self._retrain_in_budget = retrain_full == "budget" and (eval_method == "holdout" and self._state.X_val is None) self._auto_augment = auto_augment + self._allow_label_overlap = allow_label_overlap _sample_size_from_starting_points = {} if isinstance(starting_points, dict): diff --git a/flaml/automl/task/generic_task.py b/flaml/automl/task/generic_task.py index 5b74a3d755..19f80a45ef 100644 --- a/flaml/automl/task/generic_task.py +++ b/flaml/automl/task/generic_task.py @@ -365,6 +365,465 @@ def _train_test_split(state, X, y, first=None, rest=None, split_ratio=0.2, strat X_train, X_val, y_train, y_val = GenericTask._split_pyspark(state, X, y, split_ratio, stratify) return X_train, X_val, y_train, y_val + def _handle_missing_labels_fast( + self, + state, + X_train, + X_val, + y_train, + y_val, + X_train_all, + y_train_all, + is_spark_dataframe, + data_is_df, + ): + """Handle missing labels by adding first instance to the set with missing label. + + This is the faster version that may create some overlap but ensures all labels + are present in both sets. If a label is missing from train, it adds the first + instance to train. If a label is missing from val, it adds the first instance to val. + If no labels are missing, no instances are duplicated. 
+ + Args: + state: The state object containing fit parameters + X_train, X_val: Training and validation features + y_train, y_val: Training and validation labels + X_train_all, y_train_all: Complete dataset + is_spark_dataframe: Whether data is pandas_on_spark + data_is_df: Whether data is DataFrame/Series + + Returns: + Tuple of (X_train, X_val, y_train, y_val) with missing labels added + """ + # Check which labels are present in train and val sets + if is_spark_dataframe: + label_set_train, _ = unique_pandas_on_spark(y_train) + label_set_val, _ = unique_pandas_on_spark(y_val) + label_set_all, first = unique_value_first_index(y_train_all) + else: + label_set_all, first = unique_value_first_index(y_train_all) + label_set_train = np.unique(y_train) + label_set_val = np.unique(y_val) + + # Find missing labels + missing_in_train = np.setdiff1d(label_set_all, label_set_train) + missing_in_val = np.setdiff1d(label_set_all, label_set_val) + + # Add first instance of missing labels to train set + if len(missing_in_train) > 0: + missing_train_indices = [] + for label in missing_in_train: + label_matches = np.where(label_set_all == label)[0] + if len(label_matches) > 0 and label_matches[0] < len(first): + missing_train_indices.append(first[label_matches[0]]) + + if len(missing_train_indices) > 0: + X_missing_train = ( + iloc_pandas_on_spark(X_train_all, missing_train_indices) + if is_spark_dataframe + else X_train_all.iloc[missing_train_indices] + if data_is_df + else X_train_all[missing_train_indices] + ) + y_missing_train = ( + iloc_pandas_on_spark(y_train_all, missing_train_indices) + if is_spark_dataframe + else y_train_all.iloc[missing_train_indices] + if isinstance(y_train_all, (pd.Series, psSeries)) + else y_train_all[missing_train_indices] + ) + X_train = concat(X_missing_train, X_train) + y_train = concat(y_missing_train, y_train) if data_is_df else np.concatenate([y_missing_train, y_train]) + + # Handle sample_weight if present + if "sample_weight" in 
state.fit_kwargs: + sample_weight_source = ( + state.sample_weight_all + if hasattr(state, "sample_weight_all") + else state.fit_kwargs.get("sample_weight") + ) + if sample_weight_source is not None and max(missing_train_indices) < len(sample_weight_source): + missing_weights = ( + sample_weight_source[missing_train_indices] + if isinstance(sample_weight_source, np.ndarray) + else sample_weight_source.iloc[missing_train_indices] + ) + state.fit_kwargs["sample_weight"] = concat(missing_weights, state.fit_kwargs["sample_weight"]) + + # Add first instance of missing labels to val set + if len(missing_in_val) > 0: + missing_val_indices = [] + for label in missing_in_val: + label_matches = np.where(label_set_all == label)[0] + if len(label_matches) > 0 and label_matches[0] < len(first): + missing_val_indices.append(first[label_matches[0]]) + + if len(missing_val_indices) > 0: + X_missing_val = ( + iloc_pandas_on_spark(X_train_all, missing_val_indices) + if is_spark_dataframe + else X_train_all.iloc[missing_val_indices] + if data_is_df + else X_train_all[missing_val_indices] + ) + y_missing_val = ( + iloc_pandas_on_spark(y_train_all, missing_val_indices) + if is_spark_dataframe + else y_train_all.iloc[missing_val_indices] + if isinstance(y_train_all, (pd.Series, psSeries)) + else y_train_all[missing_val_indices] + ) + X_val = concat(X_missing_val, X_val) + y_val = concat(y_missing_val, y_val) if data_is_df else np.concatenate([y_missing_val, y_val]) + + # Handle sample_weight if present + if ( + "sample_weight" in state.fit_kwargs + and hasattr(state, "weight_val") + and state.weight_val is not None + ): + sample_weight_source = ( + state.sample_weight_all + if hasattr(state, "sample_weight_all") + else state.fit_kwargs.get("sample_weight") + ) + if sample_weight_source is not None and max(missing_val_indices) < len(sample_weight_source): + missing_weights = ( + sample_weight_source[missing_val_indices] + if isinstance(sample_weight_source, np.ndarray) + else 
sample_weight_source.iloc[missing_val_indices] + ) + state.weight_val = concat(missing_weights, state.weight_val) + + return X_train, X_val, y_train, y_val + + def _handle_missing_labels_no_overlap( + self, + state, + X_train, + X_val, + y_train, + y_val, + X_train_all, + y_train_all, + is_spark_dataframe, + data_is_df, + split_ratio, + ): + """Handle missing labels intelligently to avoid overlap when possible. + + This is the slower but more precise version that: + - For single-instance classes: Adds to both sets (unavoidable overlap) + - For multi-instance classes: Re-splits them properly to avoid overlap + + Args: + state: The state object containing fit parameters + X_train, X_val: Training and validation features + y_train, y_val: Training and validation labels + X_train_all, y_train_all: Complete dataset + is_spark_dataframe: Whether data is pandas_on_spark + data_is_df: Whether data is DataFrame/Series + split_ratio: The ratio for splitting + + Returns: + Tuple of (X_train, X_val, y_train, y_val) with missing labels handled + """ + # Check which labels are present in train and val sets + if is_spark_dataframe: + label_set_train, _ = unique_pandas_on_spark(y_train) + label_set_val, _ = unique_pandas_on_spark(y_val) + label_set_all, first = unique_value_first_index(y_train_all) + else: + label_set_all, first = unique_value_first_index(y_train_all) + label_set_train = np.unique(y_train) + label_set_val = np.unique(y_val) + + # Find missing labels + missing_in_train = np.setdiff1d(label_set_all, label_set_train) + missing_in_val = np.setdiff1d(label_set_all, label_set_val) + + # Handle missing labels intelligently + # For classes with only 1 instance: add to both sets (unavoidable overlap) + # For classes with multiple instances: move/split them properly to avoid overlap + + if len(missing_in_train) > 0: + # Process missing labels in training set + for label in missing_in_train: + # Find all indices for this label in the original data + if is_spark_dataframe: + 
label_indices = np.where(y_train_all.to_numpy() == label)[0].tolist() + else: + label_indices = np.where(np.asarray(y_train_all) == label)[0].tolist() + + num_instances = len(label_indices) + + if num_instances == 1: + # Single instance: must add to both train and val (unavoidable overlap) + X_single = ( + iloc_pandas_on_spark(X_train_all, label_indices) + if is_spark_dataframe + else X_train_all.iloc[label_indices] + if data_is_df + else X_train_all[label_indices] + ) + y_single = ( + iloc_pandas_on_spark(y_train_all, label_indices) + if is_spark_dataframe + else y_train_all.iloc[label_indices] + if isinstance(y_train_all, (pd.Series, psSeries)) + else y_train_all[label_indices] + ) + X_train = concat(X_single, X_train) + y_train = concat(y_single, y_train) if data_is_df else np.concatenate([y_single, y_train]) + + # Handle sample_weight + if "sample_weight" in state.fit_kwargs: + sample_weight_source = ( + state.sample_weight_all + if hasattr(state, "sample_weight_all") + else state.fit_kwargs.get("sample_weight") + ) + if sample_weight_source is not None and label_indices[0] < len(sample_weight_source): + single_weight = ( + sample_weight_source[label_indices] + if isinstance(sample_weight_source, np.ndarray) + else sample_weight_source.iloc[label_indices] + ) + state.fit_kwargs["sample_weight"] = concat(single_weight, state.fit_kwargs["sample_weight"]) + else: + # Multiple instances: move some from val to train (no overlap needed) + # Calculate how many to move to train (leave at least 1 in val) + num_to_train = max(1, min(num_instances - 1, int(num_instances * (1 - split_ratio)))) + indices_to_move = label_indices[:num_to_train] + + X_to_move = ( + iloc_pandas_on_spark(X_train_all, indices_to_move) + if is_spark_dataframe + else X_train_all.iloc[indices_to_move] + if data_is_df + else X_train_all[indices_to_move] + ) + y_to_move = ( + iloc_pandas_on_spark(y_train_all, indices_to_move) + if is_spark_dataframe + else y_train_all.iloc[indices_to_move] + if 
isinstance(y_train_all, (pd.Series, psSeries)) + else y_train_all[indices_to_move] + ) + + # Add to train + X_train = concat(X_to_move, X_train) + y_train = concat(y_to_move, y_train) if data_is_df else np.concatenate([y_to_move, y_train]) + + # Remove from val (they are currently all in val) + if is_spark_dataframe: + val_mask = ~y_val.isin([label]) + X_val = X_val[val_mask] + y_val = y_val[val_mask] + else: + val_mask = np.asarray(y_val) != label + if data_is_df: + X_val = X_val[val_mask] + y_val = y_val[val_mask] + else: + X_val = X_val[val_mask] + y_val = y_val[val_mask] + + # Add remaining instances back to val + remaining_indices = label_indices[num_to_train:] + if len(remaining_indices) > 0: + X_remaining = ( + iloc_pandas_on_spark(X_train_all, remaining_indices) + if is_spark_dataframe + else X_train_all.iloc[remaining_indices] + if data_is_df + else X_train_all[remaining_indices] + ) + y_remaining = ( + iloc_pandas_on_spark(y_train_all, remaining_indices) + if is_spark_dataframe + else y_train_all.iloc[remaining_indices] + if isinstance(y_train_all, (pd.Series, psSeries)) + else y_train_all[remaining_indices] + ) + X_val = concat(X_remaining, X_val) + y_val = concat(y_remaining, y_val) if data_is_df else np.concatenate([y_remaining, y_val]) + + # Handle sample_weight + if "sample_weight" in state.fit_kwargs: + sample_weight_source = ( + state.sample_weight_all + if hasattr(state, "sample_weight_all") + else state.fit_kwargs.get("sample_weight") + ) + if sample_weight_source is not None and max(indices_to_move) < len(sample_weight_source): + weights_to_move = ( + sample_weight_source[indices_to_move] + if isinstance(sample_weight_source, np.ndarray) + else sample_weight_source.iloc[indices_to_move] + ) + state.fit_kwargs["sample_weight"] = concat( + weights_to_move, state.fit_kwargs["sample_weight"] + ) + + if ( + len(remaining_indices) > 0 + and hasattr(state, "weight_val") + and state.weight_val is not None + ): + # Remove and re-add weights for val + if 
isinstance(state.weight_val, np.ndarray): + state.weight_val = state.weight_val[val_mask] + else: + state.weight_val = state.weight_val[val_mask] + + if max(remaining_indices) < len(sample_weight_source): + remaining_weights = ( + sample_weight_source[remaining_indices] + if isinstance(sample_weight_source, np.ndarray) + else sample_weight_source.iloc[remaining_indices] + ) + state.weight_val = concat(remaining_weights, state.weight_val) + + if len(missing_in_val) > 0: + # Process missing labels in validation set + for label in missing_in_val: + # Find all indices for this label in the original data + if is_spark_dataframe: + label_indices = np.where(y_train_all.to_numpy() == label)[0].tolist() + else: + label_indices = np.where(np.asarray(y_train_all) == label)[0].tolist() + + num_instances = len(label_indices) + + if num_instances == 1: + # Single instance: must add to both train and val (unavoidable overlap) + X_single = ( + iloc_pandas_on_spark(X_train_all, label_indices) + if is_spark_dataframe + else X_train_all.iloc[label_indices] + if data_is_df + else X_train_all[label_indices] + ) + y_single = ( + iloc_pandas_on_spark(y_train_all, label_indices) + if is_spark_dataframe + else y_train_all.iloc[label_indices] + if isinstance(y_train_all, (pd.Series, psSeries)) + else y_train_all[label_indices] + ) + X_val = concat(X_single, X_val) + y_val = concat(y_single, y_val) if data_is_df else np.concatenate([y_single, y_val]) + + # Handle sample_weight + if "sample_weight" in state.fit_kwargs and hasattr(state, "weight_val"): + sample_weight_source = ( + state.sample_weight_all + if hasattr(state, "sample_weight_all") + else state.fit_kwargs.get("sample_weight") + ) + if sample_weight_source is not None and label_indices[0] < len(sample_weight_source): + single_weight = ( + sample_weight_source[label_indices] + if isinstance(sample_weight_source, np.ndarray) + else sample_weight_source.iloc[label_indices] + ) + if state.weight_val is not None: + state.weight_val = 
concat(single_weight, state.weight_val) + else: + # Multiple instances: move some from train to val (no overlap needed) + # Calculate how many to move to val (leave at least 1 in train) + num_to_val = max(1, min(num_instances - 1, int(num_instances * split_ratio))) + indices_to_move = label_indices[:num_to_val] + + X_to_move = ( + iloc_pandas_on_spark(X_train_all, indices_to_move) + if is_spark_dataframe + else X_train_all.iloc[indices_to_move] + if data_is_df + else X_train_all[indices_to_move] + ) + y_to_move = ( + iloc_pandas_on_spark(y_train_all, indices_to_move) + if is_spark_dataframe + else y_train_all.iloc[indices_to_move] + if isinstance(y_train_all, (pd.Series, psSeries)) + else y_train_all[indices_to_move] + ) + + # Add to val + X_val = concat(X_to_move, X_val) + y_val = concat(y_to_move, y_val) if data_is_df else np.concatenate([y_to_move, y_val]) + + # Remove from train (they are currently all in train) + if is_spark_dataframe: + train_mask = ~y_train.isin([label]) + X_train = X_train[train_mask] + y_train = y_train[train_mask] + else: + train_mask = np.asarray(y_train) != label + if data_is_df: + X_train = X_train[train_mask] + y_train = y_train[train_mask] + else: + X_train = X_train[train_mask] + y_train = y_train[train_mask] + + # Add remaining instances back to train + remaining_indices = label_indices[num_to_val:] + if len(remaining_indices) > 0: + X_remaining = ( + iloc_pandas_on_spark(X_train_all, remaining_indices) + if is_spark_dataframe + else X_train_all.iloc[remaining_indices] + if data_is_df + else X_train_all[remaining_indices] + ) + y_remaining = ( + iloc_pandas_on_spark(y_train_all, remaining_indices) + if is_spark_dataframe + else y_train_all.iloc[remaining_indices] + if isinstance(y_train_all, (pd.Series, psSeries)) + else y_train_all[remaining_indices] + ) + X_train = concat(X_remaining, X_train) + y_train = concat(y_remaining, y_train) if data_is_df else np.concatenate([y_remaining, y_train]) + + # Handle sample_weight + if 
"sample_weight" in state.fit_kwargs: + sample_weight_source = ( + state.sample_weight_all + if hasattr(state, "sample_weight_all") + else state.fit_kwargs.get("sample_weight") + ) + if sample_weight_source is not None and max(indices_to_move) < len(sample_weight_source): + weights_to_move = ( + sample_weight_source[indices_to_move] + if isinstance(sample_weight_source, np.ndarray) + else sample_weight_source.iloc[indices_to_move] + ) + if hasattr(state, "weight_val") and state.weight_val is not None: + state.weight_val = concat(weights_to_move, state.weight_val) + + if len(remaining_indices) > 0: + # Remove and re-add weights for train + if isinstance(state.fit_kwargs["sample_weight"], np.ndarray): + state.fit_kwargs["sample_weight"] = state.fit_kwargs["sample_weight"][train_mask] + else: + state.fit_kwargs["sample_weight"] = state.fit_kwargs["sample_weight"][train_mask] + + if max(remaining_indices) < len(sample_weight_source): + remaining_weights = ( + sample_weight_source[remaining_indices] + if isinstance(sample_weight_source, np.ndarray) + else sample_weight_source.iloc[remaining_indices] + ) + state.fit_kwargs["sample_weight"] = concat( + remaining_weights, state.fit_kwargs["sample_weight"] + ) + + return X_train, X_val, y_train, y_val + def prepare_data( self, state, @@ -377,6 +836,7 @@ def prepare_data( n_splits, data_is_df, sample_weight_full, + allow_label_overlap=True, ) -> int: X_val, y_val = state.X_val, state.y_val if issparse(X_val): @@ -505,59 +965,46 @@ def prepare_data( elif self.is_classification(): # for classification, make sure the labels are complete in both # training and validation data - label_set, first = unique_value_first_index(y_train_all) - rest = [] - last = 0 - first.sort() - for i in range(len(first)): - rest.extend(range(last, first[i])) - last = first[i] + 1 - rest.extend(range(last, len(y_train_all))) - X_first = X_train_all.iloc[first] if data_is_df else X_train_all[first] - if len(first) < len(y_train_all) / 2: - # Get X_rest 
and y_rest with drop, sparse matrix can't apply np.delete - X_rest = ( - np.delete(X_train_all, first, axis=0) - if isinstance(X_train_all, np.ndarray) - else X_train_all.drop(first.tolist()) - if data_is_df - else X_train_all[rest] - ) - y_rest = ( - np.delete(y_train_all, first, axis=0) - if isinstance(y_train_all, np.ndarray) - else y_train_all.drop(first.tolist()) - if data_is_df - else y_train_all[rest] + stratify = y_train_all if split_type == "stratified" else None + X_train, X_val, y_train, y_val = self._train_test_split( + state, X_train_all, y_train_all, split_ratio=split_ratio, stratify=stratify + ) + + # Handle missing labels using the appropriate strategy + if allow_label_overlap: + # Fast version: adds first instance to set with missing label (may create overlap) + X_train, X_val, y_train, y_val = self._handle_missing_labels_fast( + state, + X_train, + X_val, + y_train, + y_val, + X_train_all, + y_train_all, + is_spark_dataframe, + data_is_df, ) else: - X_rest = ( - iloc_pandas_on_spark(X_train_all, rest) - if is_spark_dataframe - else X_train_all.iloc[rest] - if data_is_df - else X_train_all[rest] - ) - y_rest = ( - iloc_pandas_on_spark(y_train_all, rest) - if is_spark_dataframe - else y_train_all.iloc[rest] - if data_is_df - else y_train_all[rest] + # Precise version: avoids overlap when possible (slower) + X_train, X_val, y_train, y_val = self._handle_missing_labels_no_overlap( + state, + X_train, + X_val, + y_train, + y_val, + X_train_all, + y_train_all, + is_spark_dataframe, + data_is_df, + split_ratio, ) - stratify = y_rest if split_type == "stratified" else None - X_train, X_val, y_train, y_val = self._train_test_split( - state, X_rest, y_rest, first, rest, split_ratio, stratify - ) - X_train = concat(X_first, X_train) - y_train = concat(label_set, y_train) if data_is_df else np.concatenate([label_set, y_train]) - X_val = concat(X_first, X_val) - y_val = concat(label_set, y_val) if data_is_df else np.concatenate([label_set, y_val]) if 
isinstance(y_train, (psDataFrame, pd.DataFrame)) and y_train.shape[1] == 1: y_train = y_train[y_train.columns[0]] y_val = y_val[y_val.columns[0]] - y_train.name = y_val.name = y_rest.name + # Only set name if y_train_all is a Series (not a DataFrame) + if isinstance(y_train_all, (pd.Series, psSeries)): + y_train.name = y_val.name = y_train_all.name elif self.is_regression(): X_train, X_val, y_train, y_val = self._train_test_split( diff --git a/test/automl/test_no_overlap.py b/test/automl/test_no_overlap.py new file mode 100644 index 0000000000..443d8b9980 --- /dev/null +++ b/test/automl/test_no_overlap.py @@ -0,0 +1,272 @@ +"""Test to ensure correct label overlap handling for classification tasks""" +import numpy as np +import pandas as pd +from sklearn.datasets import load_iris, make_classification + +from flaml import AutoML + + +def test_allow_label_overlap_true(): + """Test with allow_label_overlap=True (fast mode, default)""" + # Load iris dataset + dic_data = load_iris(as_frame=True) + iris_data = dic_data["frame"] + + # Prepare data + x_train = iris_data[["sepal length (cm)", "sepal width (cm)", "petal length (cm)", "petal width (cm)"]].to_numpy() + y_train = iris_data["target"] + + # Train with fast mode (default) + automl = AutoML() + automl_settings = { + "max_iter": 5, + "metric": "accuracy", + "task": "classification", + "estimator_list": ["lgbm"], + "eval_method": "holdout", + "split_type": "stratified", + "keep_search_state": True, + "retrain_full": False, + "auto_augment": False, + "verbose": 0, + "allow_label_overlap": True, # Fast mode + } + automl.fit(x_train, y_train, **automl_settings) + + # Check results + input_size = len(x_train) + train_size = len(automl._state.X_train) + val_size = len(automl._state.X_val) + + # With stratified split on balanced data, fast mode may have no overlap + assert ( + train_size + val_size >= input_size + ), f"Inconsistent sizes. 
Input: {input_size}, Train: {train_size}, Val: {val_size}" + + # Verify all classes are represented in both sets + train_labels = set(np.unique(automl._state.y_train)) + val_labels = set(np.unique(automl._state.y_val)) + all_labels = set(np.unique(y_train)) + + assert train_labels == all_labels, f"Not all labels in train. All: {all_labels}, Train: {train_labels}" + assert val_labels == all_labels, f"Not all labels in val. All: {all_labels}, Val: {val_labels}" + + print( + f"✓ Test passed (fast mode): Input: {input_size}, Train: {train_size}, Val: {val_size}, " + f"Overlap: {train_size + val_size - input_size}" + ) + + +def test_allow_label_overlap_false(): + """Test with allow_label_overlap=False (precise mode)""" + # Load iris dataset + dic_data = load_iris(as_frame=True) + iris_data = dic_data["frame"] + + # Prepare data + x_train = iris_data[["sepal length (cm)", "sepal width (cm)", "petal length (cm)", "petal width (cm)"]].to_numpy() + y_train = iris_data["target"] + + # Train with precise mode + automl = AutoML() + automl_settings = { + "max_iter": 5, + "metric": "accuracy", + "task": "classification", + "estimator_list": ["lgbm"], + "eval_method": "holdout", + "split_type": "stratified", + "keep_search_state": True, + "retrain_full": False, + "auto_augment": False, + "verbose": 0, + "allow_label_overlap": False, # Precise mode + } + automl.fit(x_train, y_train, **automl_settings) + + # Check that there's no overlap (or minimal overlap for single-instance classes) + input_size = len(x_train) + train_size = len(automl._state.X_train) + val_size = len(automl._state.X_val) + + # Verify all classes are represented + all_labels = set(np.unique(y_train)) + + # Should have no overlap or minimal overlap + overlap = train_size + val_size - input_size + assert overlap <= len(all_labels), f"Excessive overlap: {overlap}" + + # Verify all classes are represented + train_labels = set(np.unique(automl._state.y_train)) + val_labels = set(np.unique(automl._state.y_val)) + + 
combined_labels = train_labels.union(val_labels) + assert combined_labels == all_labels, f"Not all labels present. All: {all_labels}, Combined: {combined_labels}" + + print( + f"✓ Test passed (precise mode): Input: {input_size}, Train: {train_size}, Val: {val_size}, " + f"Overlap: {overlap}" + ) + + +def test_uniform_split_with_overlap_control(): + """Test with uniform split and both overlap modes""" + # Load iris dataset + dic_data = load_iris(as_frame=True) + iris_data = dic_data["frame"] + + # Prepare data + x_train = iris_data[["sepal length (cm)", "sepal width (cm)", "petal length (cm)", "petal width (cm)"]].to_numpy() + y_train = iris_data["target"] + + # Test precise mode with uniform split + automl = AutoML() + automl_settings = { + "max_iter": 5, + "metric": "accuracy", + "task": "classification", + "estimator_list": ["lgbm"], + "eval_method": "holdout", + "split_type": "uniform", + "keep_search_state": True, + "retrain_full": False, + "auto_augment": False, + "verbose": 0, + "allow_label_overlap": False, # Precise mode + } + automl.fit(x_train, y_train, **automl_settings) + + input_size = len(x_train) + train_size = len(automl._state.X_train) + val_size = len(automl._state.X_val) + + # Verify all classes are represented + train_labels = set(np.unique(automl._state.y_train)) + val_labels = set(np.unique(automl._state.y_val)) + all_labels = set(np.unique(y_train)) + + combined_labels = train_labels.union(val_labels) + assert combined_labels == all_labels, "Not all labels present with uniform split" + + print(f"✓ Test passed (uniform split): Input: {input_size}, Train: {train_size}, Val: {val_size}") + + +def test_with_sample_weights(): + """Test label overlap handling with sample weights""" + # Create a simple dataset + X, y = make_classification( + n_samples=200, + n_features=10, + n_informative=5, + n_redundant=2, + n_classes=3, + n_clusters_per_class=1, + random_state=42, + ) + + # Create sample weights (giving more weight to some samples) + 
sample_weight = np.random.uniform(0.5, 2.0, size=len(y)) + + # Test fast mode with sample weights + automl_fast = AutoML() + automl_fast.fit( + X, + y, + task="classification", + metric="accuracy", + estimator_list=["lgbm"], + eval_method="holdout", + split_type="stratified", + max_iter=3, + keep_search_state=True, + retrain_full=False, + auto_augment=False, + verbose=0, + allow_label_overlap=True, # Fast mode + sample_weight=sample_weight, + ) + + # Verify all labels present + train_labels_fast = set(np.unique(automl_fast._state.y_train)) + val_labels_fast = set(np.unique(automl_fast._state.y_val)) + all_labels = set(np.unique(y)) + + assert train_labels_fast == all_labels, "Not all labels in train (fast mode with weights)" + assert val_labels_fast == all_labels, "Not all labels in val (fast mode with weights)" + + # Test precise mode with sample weights + automl_precise = AutoML() + automl_precise.fit( + X, + y, + task="classification", + metric="accuracy", + estimator_list=["lgbm"], + eval_method="holdout", + split_type="stratified", + max_iter=3, + keep_search_state=True, + retrain_full=False, + auto_augment=False, + verbose=0, + allow_label_overlap=False, # Precise mode + sample_weight=sample_weight, + ) + + # Verify all labels present + train_labels_precise = set(np.unique(automl_precise._state.y_train)) + val_labels_precise = set(np.unique(automl_precise._state.y_val)) + + combined_labels = train_labels_precise.union(val_labels_precise) + assert combined_labels == all_labels, "Not all labels present (precise mode with weights)" + + print("✓ Test passed with sample weights (fast and precise modes)") + + +def test_single_instance_class(): + """Test handling of single-instance classes""" + # Create imbalanced dataset where one class has only 1 instance + X = np.random.randn(50, 4) + y = np.array([0] * 40 + [1] * 9 + [2] * 1) # Class 2 has only 1 instance + + # Test precise mode - should add single instance to both sets + automl = AutoML() + automl.fit( + X, + 
y, + task="classification", + metric="accuracy", + estimator_list=["lgbm"], + eval_method="holdout", + split_type="uniform", + max_iter=3, + keep_search_state=True, + retrain_full=False, + auto_augment=False, + verbose=0, + allow_label_overlap=False, # Precise mode + ) + + # Verify all labels present + train_labels = set(np.unique(automl._state.y_train)) + val_labels = set(np.unique(automl._state.y_val)) + all_labels = set(np.unique(y)) + + # Single-instance class should be in both sets + combined_labels = train_labels.union(val_labels) + assert combined_labels == all_labels, "Not all labels present with single-instance class" + + # Check that single-instance class (label 2) is in both sets + assert 2 in train_labels, "Single-instance class not in train" + assert 2 in val_labels, "Single-instance class not in val" + + print("✓ Test passed with single-instance class") + + +if __name__ == "__main__": + test_allow_label_overlap_true() + test_allow_label_overlap_false() + test_uniform_split_with_overlap_control() + test_with_sample_weights() + test_single_instance_class() + print("\n✓ All tests passed!") diff --git a/website/docs/Best-Practices.md b/website/docs/Best-Practices.md index a1e87f11cd..70c8d75610 100644 --- a/website/docs/Best-Practices.md +++ b/website/docs/Best-Practices.md @@ -28,6 +28,35 @@ print( - pass `sample_weight` to `AutoML.fit()`; - consider setting class weights via `custom_hp` / `fit_kwargs_by_estimator` for specific estimators (see [FAQ](FAQ)). - **Probability vs label metrics**: use `roc_auc` / `log_loss` when you care about calibrated probabilities. +- **Label overlap control** (holdout evaluation only): + - By default, FLAML uses a fast strategy (`allow_label_overlap=True`) that ensures all labels are present in both training and validation sets by adding missing labels' first instances to both sets. This is efficient but may create minor overlap. + - For strict no-overlap validation, use `allow_label_overlap=False`. 
This slower but more precise strategy intelligently re-splits multi-instance classes to avoid overlap while maintaining label completeness. + +```python +from flaml import AutoML + +# Fast version (default): allows overlap for efficiency +automl_fast = AutoML() +automl_fast.fit( + X_train, + y_train, + task="classification", + eval_method="holdout", + allow_label_overlap=True, +) # default + +# Precise version: avoids overlap when possible +automl_precise = AutoML() +automl_precise.fit( + X_train, + y_train, + task="classification", + eval_method="holdout", + allow_label_overlap=False, +) # slower but more precise +``` + +Note: This only affects holdout evaluation. CV and custom validation sets are unaffected. ## Regression