Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
21 commits
Select commit Hold shift + click to select a range
43c857d
Initial plan
Copilot Jan 19, 2026
ea986ad
Fix training/test set overlap in holdout classification by only addin…
Copilot Jan 19, 2026
d114a01
Address code review feedback: add bounds checking and fix edge cases
Copilot Jan 19, 2026
09366e9
Fix bounds checking: use correct comparison operator for array indexing
Copilot Jan 19, 2026
ef2bbf8
Fix potential ValueError with max() on empty lists and simplify test …
Copilot Jan 20, 2026
4a1ddda
Add extra bounds checking for label_matches indices
Copilot Jan 20, 2026
11787ce
Merge branch 'main' into copilot/fix-training-test-set-overlap
thinkall Jan 20, 2026
1e832d8
Merge branch 'main' into copilot/fix-training-test-set-overlap
thinkall Jan 20, 2026
ee8a3aa
Fix pandas_on_spark compatibility by using iloc_pandas_on_spark util …
Copilot Jan 20, 2026
53d8c20
Run pre-commit to fix formatting issues
Copilot Jan 20, 2026
3c8421e
Improve missing label handling to avoid overlap when possible
Copilot Jan 20, 2026
4d841d6
Fix: ensure at least 1 instance remains in original set when re-split…
Copilot Jan 20, 2026
a6f93c4
Merge branch 'main' into copilot/fix-training-test-set-overlap
thinkall Jan 20, 2026
3750f7d
Merge branch 'main' into copilot/fix-training-test-set-overlap
thinkall Jan 20, 2026
b7e562b
Refactor: Extract missing label handling into two methods (fast/slow)
Copilot Jan 20, 2026
d96b8ff
Add usage example for allow_label_overlap to Best-Practices.md
Copilot Jan 20, 2026
15205f7
Merge branch 'main' into copilot/fix-training-test-set-overlap
thinkall Jan 20, 2026
8038a6a
Address code review feedback: update documentation and tests
Copilot Jan 20, 2026
d4c9525
Merge branch 'main' into copilot/fix-training-test-set-overlap
thinkall Jan 21, 2026
5e0b138
Fix AttributeError: initialize _allow_label_overlap in settings and r…
Copilot Jan 21, 2026
b0efb54
Add docstring to fit()
thinkall Jan 21, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
20 changes: 20 additions & 0 deletions flaml/automl/automl.py
Original file line number Diff line number Diff line change
Expand Up @@ -343,6 +343,12 @@ def custom_metric(
}
```
skip_transform: boolean, default=False | Whether to skip pre-processing the data prior to modeling; when False (default), data is pre-processed.
allow_label_overlap: boolean, default=True | For classification tasks with holdout evaluation,
whether to allow label overlap between train and validation sets. When True (default),
uses a fast strategy that adds the first instance of missing labels to the set that is
missing them, which may create some overlap. When False, uses a precise but slower
strategy that intelligently re-splits instances to avoid overlap when possible.
Only affects classification tasks with holdout evaluation method.
Comment thread
thinkall marked this conversation as resolved.
fit_kwargs_by_estimator: dict, default=None | The user specified keywords arguments, grouped by estimator name.
e.g.,

Expand Down Expand Up @@ -373,6 +379,7 @@ def custom_metric(
settings["split_ratio"] = settings.get("split_ratio", SPLIT_RATIO)
settings["n_splits"] = settings.get("n_splits", N_SPLITS)
settings["auto_augment"] = settings.get("auto_augment", True)
settings["allow_label_overlap"] = settings.get("allow_label_overlap", True)
settings["metric"] = settings.get("metric", "auto")
# Validate that custom metric is callable if not a string
self._validate_metric_parameter(settings["metric"], allow_auto=True)
Expand Down Expand Up @@ -1113,6 +1120,7 @@ def retrain_from_log(
eval_method = self._decide_eval_method(eval_method, time_budget)
self.modelcount = 0
self._auto_augment = auto_augment
self._allow_label_overlap = self._settings.get("allow_label_overlap", True)
self._prepare_data(eval_method, split_ratio, n_splits)
self._state.time_budget = -1
self._state.free_mem_ratio = 0
Expand Down Expand Up @@ -1716,6 +1724,7 @@ def _prepare_data(self, eval_method, split_ratio, n_splits):
n_splits,
self._df,
self._sample_weight_full,
self._allow_label_overlap,
)
self.data_size_full = self._state.data_size_full

Expand Down Expand Up @@ -1772,6 +1781,7 @@ def fit(
time_col=None,
cv_score_agg_func=None,
skip_transform=None,
allow_label_overlap=True,
mlflow_logging=None,
fit_kwargs_by_estimator=None,
mlflow_exp_name=None,
Expand Down Expand Up @@ -2058,6 +2068,12 @@ def cv_score_agg_func(val_loss_folds, log_metrics_folds):
```

skip_transform: boolean, default=False | Whether to skip pre-processing the data prior to modeling; when False (default), data is pre-processed.
allow_label_overlap: boolean, default=True | For classification tasks with holdout evaluation,
whether to allow label overlap between train and validation sets. When True (default),
uses a fast strategy that adds the first instance of missing labels to the set that is
missing them, which may create some overlap. When False, uses a precise but slower
strategy that intelligently re-splits instances to avoid overlap when possible.
Only affects classification tasks with holdout evaluation method.
mlflow_logging: boolean, default=None | Whether to log the training results to mlflow.
Default value is None, which means the logging decision is made based on
AutoML.__init__'s mlflow_logging argument. Not valid if mlflow is not installed.
Expand Down Expand Up @@ -2127,6 +2143,9 @@ def cv_score_agg_func(val_loss_folds, log_metrics_folds):
split_ratio = split_ratio or self._settings.get("split_ratio")
n_splits = n_splits or self._settings.get("n_splits")
auto_augment = self._settings.get("auto_augment") if auto_augment is None else auto_augment
allow_label_overlap = (
self._settings.get("allow_label_overlap") if allow_label_overlap is None else allow_label_overlap
)
metric = self._settings.get("metric") if metric is None else metric
estimator_list = estimator_list or self._settings.get("estimator_list")
log_file_name = self._settings.get("log_file_name") if log_file_name is None else log_file_name
Expand Down Expand Up @@ -2309,6 +2328,7 @@ def cv_score_agg_func(val_loss_folds, log_metrics_folds):

self._retrain_in_budget = retrain_full == "budget" and (eval_method == "holdout" and self._state.X_val is None)
self._auto_augment = auto_augment
self._allow_label_overlap = allow_label_overlap

_sample_size_from_starting_points = {}
if isinstance(starting_points, dict):
Expand Down
Loading
Loading