From 4b2d53ba38430280185860abccf4a9ef0d486fc9 Mon Sep 17 00:00:00 2001 From: Daniel Lacina Date: Wed, 9 Jul 2025 13:20:37 -0500 Subject: [PATCH 1/4] implemented extra trees --- Cargo.toml | 2 + src/ensemble/extra_trees_regressor.rs | 320 ++++++++++ src/ensemble/forest_regressor.rs | 222 +++++++ src/ensemble/mod.rs | 2 + src/ensemble/random_forest_regressor.rs | 151 +---- src/lib.rs | 1 - src/tree/decision_tree_regressor.rs | 178 ++++-- src/xgboost/mod.rs | 16 - src/xgboost/xgb_regressor.rs | 762 ------------------------ 9 files changed, 700 insertions(+), 954 deletions(-) create mode 100644 src/ensemble/extra_trees_regressor.rs create mode 100644 src/ensemble/forest_regressor.rs delete mode 100644 src/xgboost/mod.rs delete mode 100644 src/xgboost/xgb_regressor.rs diff --git a/Cargo.toml b/Cargo.toml index 3c1b8ab9..e48b5d0c 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -28,6 +28,8 @@ num = "0.4" rand = { version = "0.8.5", default-features = false, features = ["small_rng"] } rand_distr = { version = "0.4", optional = true } serde = { version = "1", features = ["derive"], optional = true } +keyed_priority_queue = "0.4.2" +ordered-float = "5.0.0" [target.'cfg(not(target_arch = "wasm32"))'.dependencies] typetag = { version = "0.2", optional = true } diff --git a/src/ensemble/extra_trees_regressor.rs b/src/ensemble/extra_trees_regressor.rs new file mode 100644 index 00000000..88ea1ce8 --- /dev/null +++ b/src/ensemble/extra_trees_regressor.rs @@ -0,0 +1,320 @@ +//! # Extra Trees Regressor +//! An Extra-Trees (Extremely Randomized Trees) regressor is an ensemble learning method that fits multiple randomized +//! decision trees on the dataset and averages their predictions to improve accuracy and control over-fitting. +//! +//! It is similar to a standard Random Forest, but introduces more randomness in the way splits are chosen, which can +//! reduce the variance of the model and often make the training process faster. +//! +//! The two key differences from a standard Random Forest are: +//! 1. It uses the whole original dataset to build each tree instead of bootstrap samples. +//! 2. When splitting a node, it chooses a random split point for each feature, rather than the most optimal one. +//! +//! See [ensemble models](../index.html) for more details. +//! +//! Bigger number of estimators in general improves performance of the algorithm with an increased cost of training time. +//! The random sample of _m_ predictors is typically set to be \\(\sqrt{p}\\) from the full set of _p_ predictors. +//! +//! Example: +//! +//! ``` +//! use smartcore::linalg::basic::matrix::DenseMatrix; +//! use smartcore::ensemble::extra_trees_regressor::*; +//! +//! // Longley dataset ([https://www.statsmodels.org/stable/datasets/generated/longley.html](https://www.statsmodels.org/stable/datasets/generated/longley.html)) +//! let x = DenseMatrix::from_2d_array(&[ +//! &[234.289, 235.6, 159., 107.608, 1947., 60.323], +//! &[259.426, 232.5, 145.6, 108.632, 1948., 61.122], +//! &[258.054, 368.2, 161.6, 109.773, 1949., 60.171], +//! &[284.599, 335.1, 165., 110.929, 1950., 61.187], +//! &[328.975, 209.9, 309.9, 112.075, 1951., 63.221], +//! &[346.999, 193.2, 359.4, 113.27, 1952., 63.639], +//! &[365.385, 187., 354.7, 115.094, 1953., 64.989], +//! &[363.112, 357.8, 335., 116.219, 1954., 63.761], +//! &[397.469, 290.4, 304.8, 117.388, 1955., 66.019], +//! &[419.18, 282.2, 285.7, 118.734, 1956., 67.857], +//! &[442.769, 293.6, 279.8, 120.445, 1957., 68.169], +//! &[444.546, 468.1, 263.7, 121.95, 1958., 66.513], +//! 
&[482.704, 381.3, 255.2, 123.366, 1959., 68.655], +//! &[502.601, 393.1, 251.4, 125.368, 1960., 69.564], +//! &[518.173, 480.6, 257.2, 127.852, 1961., 69.331], +//! &[554.894, 400.7, 282.7, 130.081, 1962., 70.551], +//! ]).unwrap(); +//! let y = vec![ +//! 83.0, 88.5, 88.2, 89.5, 96.2, 98.1, 99.0, 100.0, 101.2, +//! 104.6, 108.4, 110.8, 112.6, 114.2, 115.7, 116.9 +//! ]; +//! +//! let regressor = ExtraTreesRegressor::fit(&x, &y, Default::default()).unwrap(); +//! +//! let y_hat = regressor.predict(&x).unwrap(); // use the same data for prediction +//! ``` +//! +//! +//! + +use std::default::Default; +use std::fmt::Debug; + +#[cfg(feature = "serde")] +use serde::{Deserialize, Serialize}; + +use crate::api::{Predictor, SupervisedEstimator}; +use crate::ensemble::forest_regressor::ForestRegressorParameters; +use crate::error::Failed; +use crate::linalg::basic::arrays::{Array1, Array2}; +use crate::numbers::basenum::Number; +use crate::numbers::floatnum::FloatNumber; + +use super::forest_regressor::ForestRegressor; + +#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))] +#[derive(Debug, Clone)] +/// Parameters of the Extra Trees Regressor +/// Some parameters here are passed directly into base estimator. +pub struct ExtraTreesRegressorParameters { + #[cfg_attr(feature = "serde", serde(default))] + /// Tree max depth. See [Decision Tree Regressor](../../tree/decision_tree_regressor/index.html) + pub max_depth: Option, + #[cfg_attr(feature = "serde", serde(default))] + /// The minimum number of samples required to be at a leaf node. See [Decision Tree Regressor](../../tree/decision_tree_regressor/index.html) + pub min_samples_leaf: usize, + #[cfg_attr(feature = "serde", serde(default))] + /// The minimum number of samples required to split an internal node. See [Decision Tree Regressor](../../tree/decision_tree_regressor/index.html) + pub min_samples_split: usize, + #[cfg_attr(feature = "serde", serde(default))] + /// The number of trees in the forest. + pub n_trees: usize, + #[cfg_attr(feature = "serde", serde(default))] + /// Number of random sample of predictors to use as split candidates. + pub m: Option, + #[cfg_attr(feature = "serde", serde(default))] + /// Whether to keep samples used for tree generation. This is required for OOB prediction. + pub keep_samples: bool, + #[cfg_attr(feature = "serde", serde(default))] + /// Seed used for bootstrap sampling and feature selection for each tree. + pub seed: u64, +} + +/// Extra Trees Regressor +#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))] +#[derive(Debug)] +pub struct ExtraTreesRegressor< + TX: Number + FloatNumber + PartialOrd, + TY: Number, + X: Array2, + Y: Array1, +> { + forest_regressor: Option>, +} + +impl ExtraTreesRegressorParameters { + /// Tree max depth. See [Decision Tree Classifier](../../tree/decision_tree_classifier/index.html) + pub fn with_max_depth(mut self, max_depth: u16) -> Self { + self.max_depth = Some(max_depth); + self + } + /// The minimum number of samples required to be at a leaf node. See [Decision Tree Classifier](../../tree/decision_tree_classifier/index.html) + pub fn with_min_samples_leaf(mut self, min_samples_leaf: usize) -> Self { + self.min_samples_leaf = min_samples_leaf; + self + } + /// The minimum number of samples required to split an internal node. 
See [Decision Tree Classifier](../../tree/decision_tree_classifier/index.html) + pub fn with_min_samples_split(mut self, min_samples_split: usize) -> Self { + self.min_samples_split = min_samples_split; + self + } + /// The number of trees in the forest. + pub fn with_n_trees(mut self, n_trees: usize) -> Self { + self.n_trees = n_trees; + self + } + /// Number of random sample of predictors to use as split candidates. + pub fn with_m(mut self, m: usize) -> Self { + self.m = Some(m); + self + } + + /// Whether to keep samples used for tree generation. This is required for OOB prediction. + pub fn with_keep_samples(mut self, keep_samples: bool) -> Self { + self.keep_samples = keep_samples; + self + } + + /// Seed used for bootstrap sampling and feature selection for each tree. + pub fn with_seed(mut self, seed: u64) -> Self { + self.seed = seed; + self + } +} +impl Default for ExtraTreesRegressorParameters { + fn default() -> Self { + ExtraTreesRegressorParameters { + max_depth: Option::None, + min_samples_leaf: 1, + min_samples_split: 2, + n_trees: 10, + m: Option::None, + keep_samples: false, + seed: 0, + } + } +} + +impl, Y: Array1> + SupervisedEstimator for ExtraTreesRegressor +{ + fn new() -> Self { + Self { + forest_regressor: Option::None, + } + } + + fn fit(x: &X, y: &Y, parameters: ExtraTreesRegressorParameters) -> Result { + ExtraTreesRegressor::fit(x, y, parameters) + } +} + +impl, Y: Array1> + Predictor for ExtraTreesRegressor +{ + fn predict(&self, x: &X) -> Result { + self.predict(x) + } +} + +impl, Y: Array1> + ExtraTreesRegressor +{ + /// Build a forest of trees from the training set. + /// * `x` - _NxM_ matrix with _N_ observations and _M_ features in each observation. + /// * `y` - the target class values + pub fn fit( + x: &X, + y: &Y, + parameters: ExtraTreesRegressorParameters, + ) -> Result, Failed> { + let regressor_params = ForestRegressorParameters { + max_depth: parameters.max_depth, + min_samples_leaf: parameters.min_samples_leaf, + min_samples_split: parameters.min_samples_split, + n_trees: parameters.n_trees, + m: parameters.m, + keep_samples: parameters.keep_samples, + seed: parameters.seed, + bootstrap: false, + use_random_splits: true, + }; + let forest_regressor = ForestRegressor::fit(x, y, regressor_params)?; + + Ok(ExtraTreesRegressor { + forest_regressor: Some(forest_regressor), + }) + } + + /// Predict class for `x` + /// * `x` - _KxM_ data where _K_ is number of observations and _M_ is number of features. + pub fn predict(&self, x: &X) -> Result { + let forest_regressor = self.forest_regressor.as_ref().unwrap(); + forest_regressor.predict(x) + } + + /// Predict OOB classes for `x`. `x` is expected to be equal to the dataset used in training. + pub fn predict_oob(&self, x: &X) -> Result { + let forest_regressor = self.forest_regressor.as_ref().unwrap(); + forest_regressor.predict_oob(x) + } +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::linalg::basic::matrix::DenseMatrix; + use crate::metrics::mean_squared_error; + + #[test] + fn test_extra_trees_regressor_fit_predict() { + // Use a simpler, more predictable dataset for unit testing. 
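+        // The target below is a monotone function of the two features, so even with
+        // fully random split points the averaged trees are expected to track y closely.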
+ let x = DenseMatrix::from_2d_array(&[ + &[1., 2.], + &[3., 4.], + &[5., 6.], + &[7., 8.], + &[9., 10.], + &[11., 12.], + &[13., 14.], + &[15., 16.], + ]) + .unwrap(); + let y = vec![1., 2., 3., 4., 5., 6., 7., 8.]; + + let parameters = ExtraTreesRegressorParameters::default() + .with_n_trees(100) + .with_seed(42); + + let regressor = ExtraTreesRegressor::fit(&x, &y, parameters).unwrap(); + let y_hat = regressor.predict(&x).unwrap(); + + assert_eq!(y_hat.len(), y.len()); + // A basic check to ensure the model is learning something. + // The error should be significantly less than the variance of y. + let mse = mean_squared_error(&y, &y_hat); + println!("{}", mse); + // With this simple dataset, the error should be very low. + assert!(mse < 1.0); + } + + #[test] + fn test_fit_predict_higher_dims() { + // Dataset with 10 features, but y is only dependent on the 3rd feature (index 2). + let x = DenseMatrix::from_2d_array(&[ + // The 3rd column is the important one. The rest are noise. + &[0., 0., 10., 5., 8., 1., 4., 9., 2., 7.], + &[0., 0., 20., 1., 2., 3., 4., 5., 6., 7.], + &[0., 0., 30., 7., 6., 5., 4., 3., 2., 1.], + &[0., 0., 40., 9., 2., 4., 6., 8., 1., 3.], + &[0., 0., 55., 3., 1., 8., 6., 4., 2., 9.], + &[0., 0., 65., 2., 4., 7., 5., 3., 1., 8.], + ]) + .unwrap(); + let y = vec![10., 20., 30., 40., 55., 65.]; + + let parameters = ExtraTreesRegressorParameters::default() + .with_n_trees(100) + .with_seed(42); + + let regressor = ExtraTreesRegressor::fit(&x, &y, parameters).unwrap(); + let y_hat = regressor.predict(&x).unwrap(); + + assert_eq!(y_hat.len(), y.len()); + + let mse = mean_squared_error(&y, &y_hat); + + // The model should be able to learn this simple relationship perfectly, + // ignoring the noise features. The MSE should be very low. + assert!(mse < 1.0); + } + + #[test] + fn test_reproducibility() { + let x = DenseMatrix::from_2d_array(&[ + &[1., 2.], + &[3., 4.], + &[5., 6.], + &[7., 8.], + &[9., 10.], + &[11., 12.], + ]) + .unwrap(); + let y = vec![1.0, 2.0, 3.0, 4.0, 5.0, 6.0]; + + let params = ExtraTreesRegressorParameters::default().with_seed(42); + + let regressor1 = ExtraTreesRegressor::fit(&x, &y, params.clone()).unwrap(); + let y_hat1 = regressor1.predict(&x).unwrap(); + + let regressor2 = ExtraTreesRegressor::fit(&x, &y, params.clone()).unwrap(); + let y_hat2 = regressor2.predict(&x).unwrap(); + + assert_eq!(y_hat1, y_hat2); + } +} diff --git a/src/ensemble/forest_regressor.rs b/src/ensemble/forest_regressor.rs new file mode 100644 index 00000000..88d07bd7 --- /dev/null +++ b/src/ensemble/forest_regressor.rs @@ -0,0 +1,222 @@ +use rand::Rng; +use std::fmt::Debug; + +#[cfg(feature = "serde")] +use serde::{Deserialize, Serialize}; + +use crate::error::{Failed, FailedError}; +use crate::linalg::basic::arrays::{Array1, Array2}; +use crate::numbers::basenum::Number; +use crate::numbers::floatnum::FloatNumber; + +use crate::rand_custom::get_rng_impl; +use crate::tree::decision_tree_regressor::{ + DecisionTreeRegressor, DecisionTreeRegressorParameters, +}; + +#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))] +#[derive(Debug, Clone)] +/// Parameters of the Forest Regressor +/// Some parameters here are passed directly into base estimator. +pub struct ForestRegressorParameters { + #[cfg_attr(feature = "serde", serde(default))] + /// Tree max depth. 
See [Decision Tree Regressor](../../tree/decision_tree_regressor/index.html) + pub max_depth: Option, + #[cfg_attr(feature = "serde", serde(default))] + /// The minimum number of samples required to be at a leaf node. See [Decision Tree Regressor](../../tree/decision_tree_regressor/index.html) + pub min_samples_leaf: usize, + #[cfg_attr(feature = "serde", serde(default))] + /// The minimum number of samples required to split an internal node. See [Decision Tree Regressor](../../tree/decision_tree_regressor/index.html) + pub min_samples_split: usize, + #[cfg_attr(feature = "serde", serde(default))] + /// The number of trees in the forest. + pub n_trees: usize, + #[cfg_attr(feature = "serde", serde(default))] + /// Number of random sample of predictors to use as split candidates. + pub m: Option, + #[cfg_attr(feature = "serde", serde(default))] + /// Whether to keep samples used for tree generation. This is required for OOB prediction. + pub keep_samples: bool, + #[cfg_attr(feature = "serde", serde(default))] + /// Seed used for bootstrap sampling and feature selection for each tree. + pub seed: u64, + #[cfg_attr(feature = "serde", serde(default))] + pub bootstrap: bool, + #[cfg_attr(feature = "serde", serde(default))] + pub use_random_splits: bool, +} + +impl, Y: Array1> PartialEq + for ForestRegressor +{ + fn eq(&self, other: &Self) -> bool { + if self.trees.as_ref().unwrap().len() != other.trees.as_ref().unwrap().len() { + false + } else { + self.trees + .iter() + .zip(other.trees.iter()) + .all(|(a, b)| a == b) + } + } +} + +/// Forest Regressor +#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))] +#[derive(Debug)] +pub struct ForestRegressor< + TX: Number + FloatNumber + PartialOrd, + TY: Number, + X: Array2, + Y: Array1, +> { + trees: Option>>, + samples: Option>>, +} + +impl, Y: Array1> + ForestRegressor +{ + /// Build a forest of trees from the training set. + /// * `x` - _NxM_ matrix with _N_ observations and _M_ features in each observation. 
+ /// * `y` - the target class values + pub fn fit( + x: &X, + y: &Y, + parameters: ForestRegressorParameters, + ) -> Result, Failed> { + let (n_rows, num_attributes) = x.shape(); + + if n_rows != y.shape() { + return Err(Failed::fit("Number of rows in X should = len(y)")); + } + + let mtry = parameters + .m + .unwrap_or((num_attributes as f64).sqrt().floor() as usize); + + let mut rng = get_rng_impl(Some(parameters.seed)); + let mut trees: Vec> = Vec::new(); + + let mut maybe_all_samples: Option>> = Option::None; + if parameters.keep_samples { + // TODO: use with_capacity here + maybe_all_samples = Some(Vec::new()); + } + + let mut samples: Vec = (0..n_rows).map(|_| 1).collect(); + + for _ in 0..parameters.n_trees { + if parameters.bootstrap { + samples = + ForestRegressor::::sample_with_replacement(n_rows, &mut rng); + } + + // keep samples is flag is on + if let Some(ref mut all_samples) = maybe_all_samples { + all_samples.push(samples.iter().map(|x| *x != 0).collect()) + } + + let params = DecisionTreeRegressorParameters { + max_depth: parameters.max_depth, + min_samples_leaf: parameters.min_samples_leaf, + min_samples_split: parameters.min_samples_split, + seed: Some(parameters.seed), + }; + let tree = DecisionTreeRegressor::fit_weak_learner( + x, + y, + samples.clone(), + mtry, + params, + parameters.use_random_splits, + )?; + trees.push(tree); + } + + Ok(ForestRegressor { + trees: Some(trees), + samples: maybe_all_samples, + }) + } + + /// Predict class for `x` + /// * `x` - _KxM_ data where _K_ is number of observations and _M_ is number of features. + pub fn predict(&self, x: &X) -> Result { + let mut result = Y::zeros(x.shape().0); + + let (n, _) = x.shape(); + + for i in 0..n { + result.set(i, self.predict_for_row(x, i)); + } + + Ok(result) + } + + fn predict_for_row(&self, x: &X, row: usize) -> TY { + let n_trees = self.trees.as_ref().unwrap().len(); + + let mut result = TY::zero(); + + for tree in self.trees.as_ref().unwrap().iter() { + result += tree.predict_for_row(x, row); + } + + result / TY::from_usize(n_trees).unwrap() + } + + /// Predict OOB classes for `x`. `x` is expected to be equal to the dataset used in training. + pub fn predict_oob(&self, x: &X) -> Result { + let (n, _) = x.shape(); + if self.samples.is_none() { + Err(Failed::because( + FailedError::PredictFailed, + "Need samples=true for OOB predictions.", + )) + } else if self.samples.as_ref().unwrap()[0].len() != n { + Err(Failed::because( + FailedError::PredictFailed, + "Prediction matrix must match matrix used in training for OOB predictions.", + )) + } else { + let mut result = Y::zeros(n); + + for i in 0..n { + result.set(i, self.predict_for_row_oob(x, i)); + } + + Ok(result) + } + } + + fn predict_for_row_oob(&self, x: &X, row: usize) -> TY { + let mut n_trees = 0; + let mut result = TY::zero(); + + for (tree, samples) in self + .trees + .as_ref() + .unwrap() + .iter() + .zip(self.samples.as_ref().unwrap()) + { + if !samples[row] { + result += tree.predict_for_row(x, row); + n_trees += 1; + } + } + + // TODO: What to do if there are no oob trees? + result / TY::from(n_trees).unwrap() + } + + fn sample_with_replacement(nrows: usize, rng: &mut impl Rng) -> Vec { + let mut samples = vec![0; nrows]; + for _ in 0..nrows { + let xi = rng.gen_range(0..nrows); + samples[xi] += 1; + } + samples + } +} diff --git a/src/ensemble/mod.rs b/src/ensemble/mod.rs index 8cebd5c5..45c35daf 100644 --- a/src/ensemble/mod.rs +++ b/src/ensemble/mod.rs @@ -16,6 +16,8 @@ //! //! 
* ["An Introduction to Statistical Learning", James G., Witten D., Hastie T., Tibshirani R., 8.2 Bagging, Random Forests, Boosting](http://faculty.marshall.usc.edu/gareth-james/ISL/) +pub mod extra_trees_regressor; +mod forest_regressor; /// Random forest classifier pub mod random_forest_classifier; /// Random forest regressor diff --git a/src/ensemble/random_forest_regressor.rs b/src/ensemble/random_forest_regressor.rs index efc63d3d..b597b5b2 100644 --- a/src/ensemble/random_forest_regressor.rs +++ b/src/ensemble/random_forest_regressor.rs @@ -43,7 +43,6 @@ //! //! -use rand::Rng; use std::default::Default; use std::fmt::Debug; @@ -51,15 +50,13 @@ use std::fmt::Debug; use serde::{Deserialize, Serialize}; use crate::api::{Predictor, SupervisedEstimator}; -use crate::error::{Failed, FailedError}; +use crate::ensemble::forest_regressor::ForestRegressorParameters; +use crate::error::Failed; use crate::linalg::basic::arrays::{Array1, Array2}; use crate::numbers::basenum::Number; use crate::numbers::floatnum::FloatNumber; -use crate::rand_custom::get_rng_impl; -use crate::tree::decision_tree_regressor::{ - DecisionTreeRegressor, DecisionTreeRegressorParameters, -}; +use super::forest_regressor::ForestRegressor; #[cfg_attr(feature = "serde", derive(Serialize, Deserialize))] #[derive(Debug, Clone)] @@ -98,8 +95,7 @@ pub struct RandomForestRegressor< X: Array2, Y: Array1, > { - trees: Option>>, - samples: Option>>, + forest_regressor: Option>, } impl RandomForestRegressorParameters { @@ -159,14 +155,7 @@ impl, Y: Array1 for RandomForestRegressor { fn eq(&self, other: &Self) -> bool { - if self.trees.as_ref().unwrap().len() != other.trees.as_ref().unwrap().len() { - false - } else { - self.trees - .iter() - .zip(other.trees.iter()) - .all(|(a, b)| a == b) - } + self.forest_regressor == other.forest_regressor } } @@ -176,8 +165,7 @@ impl, Y: Array1 { fn new() -> Self { Self { - trees: Option::None, - samples: Option::None, + forest_regressor: Option::None, } } @@ -397,128 +385,35 @@ impl, Y: Array1 y: &Y, parameters: RandomForestRegressorParameters, ) -> Result, Failed> { - let (n_rows, num_attributes) = x.shape(); - - if n_rows != y.shape() { - return Err(Failed::fit("Number of rows in X should = len(y)")); - } - - let mtry = parameters - .m - .unwrap_or((num_attributes as f64).sqrt().floor() as usize); - - let mut rng = get_rng_impl(Some(parameters.seed)); - let mut trees: Vec> = Vec::new(); - - let mut maybe_all_samples: Option>> = Option::None; - if parameters.keep_samples { - // TODO: use with_capacity here - maybe_all_samples = Some(Vec::new()); - } - - for _ in 0..parameters.n_trees { - let samples: Vec = - RandomForestRegressor::::sample_with_replacement(n_rows, &mut rng); - - // keep samples is flag is on - if let Some(ref mut all_samples) = maybe_all_samples { - all_samples.push(samples.iter().map(|x| *x != 0).collect()) - } - - let params = DecisionTreeRegressorParameters { - max_depth: parameters.max_depth, - min_samples_leaf: parameters.min_samples_leaf, - min_samples_split: parameters.min_samples_split, - seed: Some(parameters.seed), - }; - let tree = DecisionTreeRegressor::fit_weak_learner(x, y, samples, mtry, params)?; - trees.push(tree); - } + let regressor_params = ForestRegressorParameters { + max_depth: parameters.max_depth, + min_samples_leaf: parameters.min_samples_leaf, + min_samples_split: parameters.min_samples_split, + n_trees: parameters.n_trees, + m: parameters.m, + keep_samples: parameters.keep_samples, + seed: parameters.seed, + bootstrap: true, + use_random_splits: 
false, + }; + let forest_regressor = ForestRegressor::fit(x, y, regressor_params)?; Ok(RandomForestRegressor { - trees: Some(trees), - samples: maybe_all_samples, + forest_regressor: Some(forest_regressor), }) } /// Predict class for `x` /// * `x` - _KxM_ data where _K_ is number of observations and _M_ is number of features. pub fn predict(&self, x: &X) -> Result { - let mut result = Y::zeros(x.shape().0); - - let (n, _) = x.shape(); - - for i in 0..n { - result.set(i, self.predict_for_row(x, i)); - } - - Ok(result) - } - - fn predict_for_row(&self, x: &X, row: usize) -> TY { - let n_trees = self.trees.as_ref().unwrap().len(); - - let mut result = TY::zero(); - - for tree in self.trees.as_ref().unwrap().iter() { - result += tree.predict_for_row(x, row); - } - - result / TY::from_usize(n_trees).unwrap() + let forest_regressor = self.forest_regressor.as_ref().unwrap(); + forest_regressor.predict(x) } /// Predict OOB classes for `x`. `x` is expected to be equal to the dataset used in training. pub fn predict_oob(&self, x: &X) -> Result { - let (n, _) = x.shape(); - if self.samples.is_none() { - Err(Failed::because( - FailedError::PredictFailed, - "Need samples=true for OOB predictions.", - )) - } else if self.samples.as_ref().unwrap()[0].len() != n { - Err(Failed::because( - FailedError::PredictFailed, - "Prediction matrix must match matrix used in training for OOB predictions.", - )) - } else { - let mut result = Y::zeros(n); - - for i in 0..n { - result.set(i, self.predict_for_row_oob(x, i)); - } - - Ok(result) - } - } - - fn predict_for_row_oob(&self, x: &X, row: usize) -> TY { - let mut n_trees = 0; - let mut result = TY::zero(); - - for (tree, samples) in self - .trees - .as_ref() - .unwrap() - .iter() - .zip(self.samples.as_ref().unwrap()) - { - if !samples[row] { - result += tree.predict_for_row(x, row); - n_trees += 1; - } - } - - // TODO: What to do if there are no oob trees? 
- result / TY::from(n_trees).unwrap() - } - - fn sample_with_replacement(nrows: usize, rng: &mut impl Rng) -> Vec { - let mut samples = vec![0; nrows]; - for _ in 0..nrows { - let xi = rng.gen_range(0..nrows); - samples[xi] += 1; - } - samples + let forest_regressor = self.forest_regressor.as_ref().unwrap(); + forest_regressor.predict_oob(x) } } diff --git a/src/lib.rs b/src/lib.rs index c68368fa..c6f9349c 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -130,6 +130,5 @@ pub mod readers; pub mod svm; /// Supervised tree-based learning methods pub mod tree; -pub mod xgboost; pub(crate) mod rand_custom; diff --git a/src/tree/decision_tree_regressor.rs b/src/tree/decision_tree_regressor.rs index d735697d..af4e2748 100644 --- a/src/tree/decision_tree_regressor.rs +++ b/src/tree/decision_tree_regressor.rs @@ -426,7 +426,7 @@ impl, Y: Array1> } let samples = vec![1; x_nrows]; - DecisionTreeRegressor::fit_weak_learner(x, y, samples, num_attributes, parameters) + DecisionTreeRegressor::fit_weak_learner(x, y, samples, num_attributes, parameters, false) } pub(crate) fn fit_weak_learner( @@ -435,6 +435,7 @@ impl, Y: Array1> samples: Vec, mtry: usize, parameters: DecisionTreeRegressorParameters, + use_random_splits: bool, ) -> Result, Failed> { let y_m = y.clone(); @@ -474,13 +475,15 @@ impl, Y: Array1> let mut visitor_queue: LinkedList> = LinkedList::new(); - if tree.find_best_cutoff(&mut visitor, mtry, &mut rng) { + if tree.find_best_cutoff(&mut visitor, mtry, &mut rng, use_random_splits) { visitor_queue.push_back(visitor); } while tree.depth() < tree.parameters().max_depth.unwrap_or(u16::MAX) { match visitor_queue.pop_front() { - Some(node) => tree.split(node, mtry, &mut visitor_queue, &mut rng), + Some(node) => { + tree.split(node, mtry, &mut visitor_queue, &mut rng, use_random_splits) + } None => break, }; } @@ -534,6 +537,7 @@ impl, Y: Array1> visitor: &mut NodeVisitor<'_, TX, TY, X, Y>, mtry: usize, rng: &mut impl Rng, + use_random_splits: bool, ) -> bool { let (_, n_attr) = visitor.x.shape(); @@ -555,7 +559,15 @@ impl, Y: Array1> n as f64 * self.nodes()[visitor.node].output * self.nodes()[visitor.node].output; for variable in variables.iter().take(mtry) { - self.find_best_split(visitor, n, sum, parent_gain, *variable); + self.find_best_split( + visitor, + n, + sum, + parent_gain, + *variable, + rng, + use_random_splits, + ); } self.nodes()[visitor.node].split_score.is_some() @@ -568,65 +580,137 @@ impl, Y: Array1> sum: f64, parent_gain: f64, j: usize, + rng: &mut impl Rng, + use_random_splits: bool, ) { - let mut true_sum = 0f64; - let mut true_count = 0; - let mut prevx = Option::None; - - for i in visitor.order[j].iter() { - if visitor.samples[*i] > 0 { - let x_ij = *visitor.x.get((*i, j)); - - if prevx.is_none() || x_ij == prevx.unwrap() { - prevx = Some(x_ij); - true_count += visitor.samples[*i]; - true_sum += visitor.samples[*i] as f64 * visitor.y.get(*i).to_f64().unwrap(); - continue; + if use_random_splits { + // --- EFFICIENT LOGIC for Continuous Random Splits --- + let (min_val, max_val) = { + let mut min_opt = None; + let mut max_opt = None; + for &i in &visitor.order[j] { + if visitor.samples[i] > 0 { + min_opt = Some(*visitor.x.get((i, j))); + break; + } + } + for &i in visitor.order[j].iter().rev() { + if visitor.samples[i] > 0 { + max_opt = Some(*visitor.x.get((i, j))); + break; + } } + if min_opt.is_none() { + return; + } + (min_opt.unwrap(), max_opt.unwrap()) + }; - let false_count = n - true_count; + if min_val >= max_val { + return; + } - if true_count < 
self.parameters().min_samples_leaf - || false_count < self.parameters().min_samples_leaf - { - prevx = Some(x_ij); - true_count += visitor.samples[*i]; - true_sum += visitor.samples[*i] as f64 * visitor.y.get(*i).to_f64().unwrap(); - continue; - } + let split_value = rng.gen_range(min_val.to_f64().unwrap()..max_val.to_f64().unwrap()); - let true_mean = true_sum / true_count as f64; - let false_mean = (sum - true_sum) / false_count as f64; + let mut true_sum = 0f64; + let mut true_count = 0; + for &i in &visitor.order[j] { + if visitor.samples[i] > 0 { + if visitor.x.get((i, j)).to_f64().unwrap() <= split_value { + true_sum += visitor.samples[i] as f64 * visitor.y.get(i).to_f64().unwrap(); + true_count += visitor.samples[i]; + } else { + break; + } + } + } - let gain = (true_count as f64 * true_mean * true_mean - + false_count as f64 * false_mean * false_mean) - - parent_gain; + let false_count = n - true_count; - if self.nodes()[visitor.node].split_score.is_none() - || gain > self.nodes()[visitor.node].split_score.unwrap() - { - self.nodes[visitor.node].split_feature = j; - self.nodes[visitor.node].split_value = - Option::Some((x_ij + prevx.unwrap()).to_f64().unwrap() / 2f64); - self.nodes[visitor.node].split_score = Option::Some(gain); + if true_count < self.parameters().min_samples_leaf + || false_count < self.parameters().min_samples_leaf + { + return; + } - visitor.true_child_output = true_mean; - visitor.false_child_output = false_mean; + let true_mean = if true_count > 0 { + true_sum / true_count as f64 + } else { + 0.0 + }; + let false_mean = if false_count > 0 { + (sum - true_sum) / false_count as f64 + } else { + 0.0 + }; + let gain = (true_count as f64 * true_mean * true_mean + + false_count as f64 * false_mean * false_mean) + - parent_gain; + + if self.nodes[visitor.node].split_score.is_none() + || gain > self.nodes[visitor.node].split_score.unwrap() + { + self.nodes[visitor.node].split_feature = j; + self.nodes[visitor.node].split_value = Some(split_value); + self.nodes[visitor.node].split_score = Some(gain); + visitor.true_child_output = true_mean; + visitor.false_child_output = false_mean; + } + } else { + let mut true_sum = 0f64; + let mut true_count = 0; + let mut prevx = Option::None; + let order = &visitor.order[j]; + + for i in order.iter() { + if visitor.samples[*i] > 0 { + let x_ij = *visitor.x.get((*i, j)); + if prevx.is_none() || x_ij == prevx.unwrap() { + prevx = Some(x_ij); + true_count += visitor.samples[*i]; + true_sum += + visitor.samples[*i] as f64 * visitor.y.get(*i).to_f64().unwrap(); + continue; + } + let false_count = n - true_count; + if true_count < self.parameters().min_samples_leaf + || false_count < self.parameters().min_samples_leaf + { + prevx = Some(x_ij); + true_count += visitor.samples[*i]; + true_sum += + visitor.samples[*i] as f64 * visitor.y.get(*i).to_f64().unwrap(); + continue; + } + let true_mean = true_sum / true_count as f64; + let false_mean = (sum - true_sum) / false_count as f64; + let gain = (true_count as f64 * true_mean * true_mean + + false_count as f64 * false_mean * false_mean) + - parent_gain; + if self.nodes[visitor.node].split_score.is_none() + || gain > self.nodes[visitor.node].split_score.unwrap() + { + self.nodes[visitor.node].split_feature = j; + self.nodes[visitor.node].split_value = + Option::Some((x_ij + prevx.unwrap()).to_f64().unwrap() / 2f64); + self.nodes[visitor.node].split_score = Some(gain); + visitor.true_child_output = true_mean; + visitor.false_child_output = false_mean; + } + prevx = Some(x_ij); + true_sum += 
visitor.samples[*i] as f64 * visitor.y.get(*i).to_f64().unwrap(); + true_count += visitor.samples[*i]; } - - prevx = Some(x_ij); - true_sum += visitor.samples[*i] as f64 * visitor.y.get(*i).to_f64().unwrap(); - true_count += visitor.samples[*i]; } } } - fn split<'a>( &mut self, mut visitor: NodeVisitor<'a, TX, TY, X, Y>, mtry: usize, visitor_queue: &mut LinkedList>, rng: &mut impl Rng, + use_random_splits: bool, ) -> bool { let (n, _) = visitor.x.shape(); let mut tc = 0; @@ -679,7 +763,7 @@ impl, Y: Array1> visitor.level + 1, ); - if self.find_best_cutoff(&mut true_visitor, mtry, rng) { + if self.find_best_cutoff(&mut true_visitor, mtry, rng, use_random_splits) { visitor_queue.push_back(true_visitor); } @@ -692,7 +776,7 @@ impl, Y: Array1> visitor.level + 1, ); - if self.find_best_cutoff(&mut false_visitor, mtry, rng) { + if self.find_best_cutoff(&mut false_visitor, mtry, rng, use_random_splits) { visitor_queue.push_back(false_visitor); } diff --git a/src/xgboost/mod.rs b/src/xgboost/mod.rs deleted file mode 100644 index 68e087be..00000000 --- a/src/xgboost/mod.rs +++ /dev/null @@ -1,16 +0,0 @@ -//! # XGBoost -//! -//! XGBoost, which stands for Extreme Gradient Boosting, is a powerful and efficient implementation of the gradient boosting framework. Gradient boosting is a machine learning technique for regression and classification problems, which produces a prediction model in the form of an ensemble of weak prediction models, typically decision trees. -//! -//! The core idea of boosting is to build the model in a stage-wise fashion. It learns from its mistakes by sequentially adding new models that correct the errors of the previous ones. Unlike bagging, which trains models in parallel, boosting is a sequential process. Each new tree is fit on a modified version of the original data set, specifically focusing on the instances where the previous models performed poorly. -//! -//! XGBoost enhances this process through several key innovations. It employs a more regularized model formalization to control over-fitting, which gives it better performance. It also has a highly optimized and parallelized tree construction process, making it significantly faster and more scalable than traditional gradient boosting implementations. -//! -//! ## References: -//! -//! * "Elements of Statistical Learning", Hastie T., Tibshirani R., Friedman J., 10. Boosting and Additive Trees -//! * XGBoost: A Scalable Tree Boosting System, Chen T., Guestrin C. - -// xgboost implementation -pub mod xgb_regressor; -pub use xgb_regressor::{XGRegressor, XGRegressorParameters}; diff --git a/src/xgboost/xgb_regressor.rs b/src/xgboost/xgb_regressor.rs deleted file mode 100644 index ac6ec752..00000000 --- a/src/xgboost/xgb_regressor.rs +++ /dev/null @@ -1,762 +0,0 @@ -//! # Extreme Gradient Boosting (XGBoost) -//! -//! XGBoost is a highly efficient and effective implementation of the gradient boosting framework. -//! Like other boosting models, it builds an ensemble of sequential decision trees, where each new tree -//! is trained to correct the errors of the previous ones. -//! -//! What makes XGBoost powerful is its use of both the first and second derivatives (gradient and hessian) -//! of the loss function, which allows for more accurate approximations and faster convergence. It also -//! includes built-in regularization techniques (L1/`alpha` and L2/`lambda`) to prevent overfitting. -//! -//! This implementation was ported to Rust from the concepts and algorithm explained in the blog post -//! 
["XGBoost from Scratch"](https://randomrealizations.com/posts/xgboost-from-scratch/). It is designed -//! to be a general-purpose regressor that can be used with any objective function that provides a gradient -//! and a hessian. -//! -//! Example: -//! -//! ``` -//! use smartcore::linalg::basic::matrix::DenseMatrix; -//! use smartcore::xgboost::{XGRegressor, XGRegressorParameters}; -//! -//! // Simple dataset: predict y = 2*x -//! let x = DenseMatrix::from_2d_array(&[ -//! &[1.0], &[2.0], &[3.0], &[4.0], &[5.0] -//! ]).unwrap(); -//! let y = vec![2.0, 4.0, 6.0, 8.0, 10.0]; -//! -//! // Use default parameters, but set a few for demonstration -//! let parameters = XGRegressorParameters::default() -//! .with_n_estimators(50) -//! .with_max_depth(3) -//! .with_learning_rate(0.1); -//! -//! // Train the model -//! let model = XGRegressor::fit(&x, &y, parameters).unwrap(); -//! -//! // Make predictions -//! let x_test = DenseMatrix::from_2d_array(&[&[6.0], &[7.0]]).unwrap(); -//! let y_hat = model.predict(&x_test).unwrap(); -//! -//! // y_hat should be close to [12.0, 14.0] -//! ``` -//! - -use rand::{seq::SliceRandom, Rng}; -use std::{iter::zip, marker::PhantomData}; - -use crate::{ - api::{PredictorBorrow, SupervisedEstimatorBorrow}, - error::{Failed, FailedError}, - linalg::basic::arrays::{Array1, Array2}, - numbers::basenum::Number, - rand_custom::get_rng_impl, -}; - -/// Defines the objective function to be optimized. -/// The objective function provides the loss, gradient (first derivative), and -/// hessian (second derivative) required for the XGBoost algorithm. -#[derive(Clone, Debug)] -pub enum Objective { - /// The objective for regression tasks using Mean Squared Error. - /// Loss: 0.5 * (y_true - y_pred)^2 - MeanSquaredError, -} - -impl Objective { - /// Calculates the loss for each sample given the true and predicted values. - /// - /// # Arguments - /// * `y_true` - A vector of the true target values. - /// * `y_pred` - A vector of the predicted values. - /// - /// # Returns - /// The mean of the calculated loss values. - pub fn loss_function>(&self, y_true: &Y, y_pred: &Vec) -> f64 { - match self { - Objective::MeanSquaredError => { - zip(y_true.iterator(0), y_pred) - .map(|(true_val, pred_val)| { - 0.5 * (true_val.to_f64().unwrap() - pred_val).powi(2) - }) - .sum::() - / y_true.shape() as f64 - } - } - } - - /// Calculates the gradient (first derivative) of the loss function. - /// - /// # Arguments - /// * `y_true` - A vector of the true target values. - /// * `y_pred` - A vector of the predicted values. - /// - /// # Returns - /// A vector of gradients for each sample. - pub fn gradient>(&self, y_true: &Y, y_pred: &Vec) -> Vec { - match self { - Objective::MeanSquaredError => zip(y_true.iterator(0), y_pred) - .map(|(true_val, pred_val)| (*pred_val - true_val.to_f64().unwrap())) - .collect(), - } - } - - /// Calculates the hessian (second derivative) of the loss function. - /// - /// # Arguments - /// * `y_true` - A vector of the true target values. - /// * `y_pred` - A vector of the predicted values. - /// - /// # Returns - /// A vector of hessians for each sample. - #[allow(unused_variables)] - pub fn hessian>(&self, y_true: &Y, y_pred: &[f64]) -> Vec { - match self { - Objective::MeanSquaredError => vec![1.0; y_true.shape()], - } - } -} - -/// Represents a single decision tree in the XGBoost ensemble. -/// -/// This is a recursive data structure where each `TreeRegressor` is a node -/// that can have a left and a right child, also of type `TreeRegressor`. 
-#[allow(dead_code)] -struct TreeRegressor, Y: Array1> { - left: Option>>, - right: Option>>, - /// The output value of this node. If it's a leaf, this is the final prediction. - value: f64, - /// The feature value threshold used to split this node. - threshold: f64, - /// The index of the feature used for splitting. - split_feature_idx: usize, - /// The gain in score achieved by this split. - split_score: f64, - _phantom_tx: PhantomData, - _phantom_ty: PhantomData, - _phantom_x: PhantomData, - _phantom_y: PhantomData, -} - -impl, Y: Array1> - TreeRegressor -{ - /// Recursively builds a decision tree (a `TreeRegressor` node). - /// - /// This function determines the optimal split for the given set of samples (`idxs`) - /// and then recursively calls itself to build the left and right child nodes. - /// - /// # Arguments - /// * `data` - The full training dataset. - /// * `g` - Gradients for all samples. - /// * `h` - Hessians for all samples. - /// * `idxs` - The indices of the samples belonging to the current node. - /// * `max_depth` - The maximum remaining depth for this branch. - /// * `min_child_weight` - The minimum sum of hessians required in a child node. - /// * `lambda` - L2 regularization term on weights. - /// * `gamma` - Minimum loss reduction required to make a further partition. - pub fn fit( - data: &X, - g: &Vec, - h: &Vec, - idxs: &[usize], - max_depth: u16, - min_child_weight: f64, - lambda: f64, - gamma: f64, - ) -> Self { - let g_sum = idxs.iter().map(|&i| g[i]).sum::(); - let h_sum = idxs.iter().map(|&i| h[i]).sum::(); - let value = -g_sum / (h_sum + lambda); - - let mut best_feature_idx = usize::MAX; - let mut best_split_score = 0.0; - let mut best_threshold = 0.0; - let mut left = Option::None; - let mut right = Option::None; - - if max_depth > 0 { - Self::insert_child_nodes( - data, - g, - h, - idxs, - &mut best_feature_idx, - &mut best_split_score, - &mut best_threshold, - &mut left, - &mut right, - max_depth, - min_child_weight, - lambda, - gamma, - ); - } - - Self { - left, - right, - value, - threshold: best_threshold, - split_feature_idx: best_feature_idx, - split_score: best_split_score, - _phantom_tx: PhantomData, - _phantom_ty: PhantomData, - _phantom_x: PhantomData, - _phantom_y: PhantomData, - } - } - - /// Finds the best split and creates child nodes if a valid split is found. - fn insert_child_nodes( - data: &X, - g: &Vec, - h: &Vec, - idxs: &[usize], - best_feature_idx: &mut usize, - best_split_score: &mut f64, - best_threshold: &mut f64, - left: &mut Option>, - right: &mut Option>, - max_depth: u16, - min_child_weight: f64, - lambda: f64, - gamma: f64, - ) { - let (_, n_features) = data.shape(); - for i in 0..n_features { - Self::find_best_split( - data, - g, - h, - idxs, - i, - best_feature_idx, - best_split_score, - best_threshold, - min_child_weight, - lambda, - gamma, - ); - } - - // A split is only valid if it results in a positive gain. 
- if *best_split_score > 0.0 { - let mut left_idxs = Vec::new(); - let mut right_idxs = Vec::new(); - for idx in idxs.iter() { - if data.get((*idx, *best_feature_idx)).to_f64().unwrap() <= *best_threshold { - left_idxs.push(*idx); - } else { - right_idxs.push(*idx); - } - } - - *left = Some(Box::new(TreeRegressor::fit( - data, - g, - h, - &left_idxs, - max_depth - 1, - min_child_weight, - lambda, - gamma, - ))); - *right = Some(Box::new(TreeRegressor::fit( - data, - g, - h, - &right_idxs, - max_depth - 1, - min_child_weight, - lambda, - gamma, - ))); - } - } - - /// Iterates through a single feature to find the best possible split point. - fn find_best_split( - data: &X, - g: &[f64], - h: &[f64], - idxs: &[usize], - feature_idx: usize, - best_feature_idx: &mut usize, - best_split_score: &mut f64, - best_threshold: &mut f64, - min_child_weight: f64, - lambda: f64, - gamma: f64, - ) { - let mut sorted_idxs = idxs.to_owned(); - sorted_idxs.sort_by(|a, b| { - data.get((*a, feature_idx)) - .partial_cmp(data.get((*b, feature_idx))) - .unwrap() - }); - - let sum_g = sorted_idxs.iter().map(|&i| g[i]).sum::(); - let sum_h = sorted_idxs.iter().map(|&i| h[i]).sum::(); - - let mut sum_g_right = sum_g; - let mut sum_h_right = sum_h; - let mut sum_g_left = 0.0; - let mut sum_h_left = 0.0; - - for i in 0..sorted_idxs.len() - 1 { - let idx = sorted_idxs[i]; - let next_idx = sorted_idxs[i + 1]; - - let g_i = g[idx]; - let h_i = h[idx]; - let x_i = data.get((idx, feature_idx)).to_f64().unwrap(); - let x_i_next = data.get((next_idx, feature_idx)).to_f64().unwrap(); - - sum_g_left += g_i; - sum_h_left += h_i; - sum_g_right -= g_i; - sum_h_right -= h_i; - - if sum_h_left < min_child_weight || x_i == x_i_next { - continue; - } - if sum_h_right < min_child_weight { - break; - } - - let gain = 0.5 - * ((sum_g_left * sum_g_left / (sum_h_left + lambda)) - + (sum_g_right * sum_g_right / (sum_h_right + lambda)) - - (sum_g * sum_g / (sum_h + lambda))) - - gamma; - - if gain > *best_split_score { - *best_split_score = gain; - *best_threshold = (x_i + x_i_next) / 2.0; - *best_feature_idx = feature_idx; - } - } - } - - /// Predicts the output values for a dataset. - pub fn predict(&self, data: &X) -> Vec { - let (n_samples, n_features) = data.shape(); - (0..n_samples) - .map(|i| { - self.predict_for_row(&Vec::from_iterator( - data.get_row(i).iterator(0).copied(), - n_features, - )) - }) - .collect() - } - - /// Predicts the output value for a single row of data by traversing the tree. - pub fn predict_for_row(&self, row: &Vec) -> f64 { - // A leaf node is identified by having no children. - if self.left.is_none() { - return self.value; - } - - // Recurse down the appropriate branch. - let child = if row[self.split_feature_idx].to_f64().unwrap() <= self.threshold { - self.left.as_ref().unwrap() - } else { - self.right.as_ref().unwrap() - }; - - child.predict_for_row(row) - } -} - -/// Parameters for the `jRegressor` model. -/// -/// This struct holds all the hyperparameters that control the training process. -#[derive(Clone, Debug)] -pub struct XGRegressorParameters { - /// The number of boosting rounds or trees to build. - pub n_estimators: usize, - /// The maximum depth of each tree. - pub max_depth: u16, - /// Step size shrinkage used to prevent overfitting. - pub learning_rate: f64, - /// Minimum sum of instance weight (hessian) needed in a child. - pub min_child_weight: usize, - /// L2 regularization term on weights. - pub lambda: f64, - /// Minimum loss reduction required to make a further partition on a leaf node. 
- pub gamma: f64, - /// The initial prediction score for all instances. - pub base_score: f64, - /// The fraction of samples to be used for fitting the individual base learners. - pub subsample: f64, - /// The seed for the random number generator for reproducibility. - pub seed: u64, - /// The objective function to be optimized. - pub objective: Objective, -} - -impl Default for XGRegressorParameters { - /// Creates a new set of `XGRegressorParameters` with default values. - fn default() -> Self { - Self { - n_estimators: 100, - learning_rate: 0.3, - max_depth: 6, - min_child_weight: 1, - lambda: 1.0, - gamma: 0.0, - base_score: 0.5, - subsample: 1.0, - seed: 0, - objective: Objective::MeanSquaredError, - } - } -} - -// Builder pattern for XGRegressorParameters -impl XGRegressorParameters { - /// Sets the number of boosting rounds or trees to build. - pub fn with_n_estimators(mut self, n_estimators: usize) -> Self { - self.n_estimators = n_estimators; - self - } - - /// Sets the step size shrinkage used to prevent overfitting. - /// - /// Also known as `eta`. A smaller value makes the model more robust by preventing - /// too much weight being given to any single tree. - pub fn with_learning_rate(mut self, learning_rate: f64) -> Self { - self.learning_rate = learning_rate; - self - } - - /// Sets the maximum depth of each individual tree. - // A lower value helps prevent overfitting.* - pub fn with_max_depth(mut self, max_depth: u16) -> Self { - self.max_depth = max_depth; - self - } - - /// Sets the minimum sum of instance weight (hessian) needed in a child node. - /// - /// If the tree partition step results in a leaf node with the sum of - // instance weight less than `min_child_weight`, then the building process* - /// will give up further partitioning. - pub fn with_min_child_weight(mut self, min_child_weight: usize) -> Self { - self.min_child_weight = min_child_weight; - self - } - - /// Sets the L2 regularization term on weights (`lambda`). - /// - /// Increasing this value will make the model more conservative. - pub fn with_lambda(mut self, lambda: f64) -> Self { - self.lambda = lambda; - self - } - - /// Sets the minimum loss reduction required to make a further partition on a leaf node. - /// - /// The larger `gamma` is, the more conservative the algorithm will be. - pub fn with_gamma(mut self, gamma: f64) -> Self { - self.gamma = gamma; - self - } - - /// Sets the initial prediction score for all instances. - pub fn with_base_score(mut self, base_score: f64) -> Self { - self.base_score = base_score; - self - } - - /// Sets the fraction of samples to be used for fitting individual base learners. - /// - /// A value of less than 1.0 introduces randomness and helps prevent overfitting. - pub fn with_subsample(mut self, subsample: f64) -> Self { - self.subsample = subsample; - self - } - - /// Sets the seed for the random number generator for reproducibility. - pub fn with_seed(mut self, seed: u64) -> Self { - self.seed = seed; - self - } - - /// Sets the objective function to be optimized during training. - pub fn with_objective(mut self, objective: Objective) -> Self { - self.objective = objective; - self - } -} - -/// An Extreme Gradient Boosting (XGBoost) model for regression and classification tasks. -pub struct XGRegressor, Y: Array1> { - regressors: Option>>, - parameters: Option, - _phantom_ty: PhantomData, - _phantom_tx: PhantomData, - _phantom_y: PhantomData, - _phantom_x: PhantomData, -} - -impl, Y: Array1> XGRegressor { - /// Fits the XGBoost model to the training data. 
- pub fn fit(data: &X, y: &Y, parameters: XGRegressorParameters) -> Result { - if parameters.subsample > 1.0 || parameters.subsample <= 0.0 { - return Err(Failed::because( - FailedError::ParametersError, - "Subsample ratio must be in (0, 1].", - )); - } - - let (n_samples, _) = data.shape(); - let learning_rate = parameters.learning_rate; - let mut predictions = vec![parameters.base_score; n_samples]; - - let mut regressors = Vec::new(); - let mut rng = get_rng_impl(Some(parameters.seed)); - - for _ in 0..parameters.n_estimators { - let gradients = parameters.objective.gradient(y, &predictions); - let hessians = parameters.objective.hessian(y, &predictions); - - let sample_idxs = if parameters.subsample < 1.0 { - Self::sample_without_replacement(n_samples, parameters.subsample, &mut rng) - } else { - (0..n_samples).collect::>() - }; - - let regressor = TreeRegressor::fit( - data, - &gradients, - &hessians, - &sample_idxs, - parameters.max_depth, - parameters.min_child_weight as f64, - parameters.lambda, - parameters.gamma, - ); - - let corrections = regressor.predict(data); - predictions = zip(predictions, corrections) - .map(|(pred, correction)| pred + (learning_rate * correction)) - .collect(); - - regressors.push(regressor); - } - - Ok(Self { - regressors: Some(regressors), - parameters: Some(parameters), - _phantom_ty: PhantomData, - _phantom_y: PhantomData, - _phantom_tx: PhantomData, - _phantom_x: PhantomData, - }) - } - - /// Predicts target values for the given input data. - pub fn predict(&self, data: &X) -> Result, Failed> { - let (n_samples, _) = data.shape(); - - let parameters = self.parameters.as_ref().unwrap(); - let mut predictions = vec![parameters.base_score; n_samples]; - let regressors = self.regressors.as_ref().unwrap(); - - for regressor in regressors.iter() { - let corrections = regressor.predict(data); - predictions = zip(predictions, corrections) - .map(|(pred, correction)| pred + (parameters.learning_rate * correction)) - .collect(); - } - - Ok(predictions - .into_iter() - .map(|p| TX::from_f64(p).unwrap()) - .collect()) - } - - /// Creates a random sample of indices without replacement. - fn sample_without_replacement( - population_size: usize, - subsample_ratio: f64, - rng: &mut impl Rng, - ) -> Vec { - let mut indices: Vec = (0..population_size).collect(); - indices.shuffle(rng); - indices.truncate((population_size as f64 * subsample_ratio) as usize); - indices - } -} - -// Boilerplate implementation for the smartcore traits -impl, Y: Array1> - SupervisedEstimatorBorrow<'_, X, Y, XGRegressorParameters> for XGRegressor -{ - fn new() -> Self { - Self { - regressors: None, - parameters: None, - _phantom_ty: PhantomData, - _phantom_y: PhantomData, - _phantom_tx: PhantomData, - _phantom_x: PhantomData, - } - } - - fn fit(x: &X, y: &Y, parameters: &XGRegressorParameters) -> Result { - XGRegressor::fit(x, y, parameters.clone()) - } -} - -impl, Y: Array1> PredictorBorrow<'_, X, TX> - for XGRegressor -{ - fn predict(&self, x: &X) -> Result, Failed> { - self.predict(x) - } -} - -// ------------------- TESTS ------------------- -#[cfg(test)] -mod tests { - use super::*; - use crate::linalg::basic::{arrays::Array, matrix::DenseMatrix}; - - /// Tests the gradient and hessian calculations for MeanSquaredError. 
- #[test] - fn test_mse_objective() { - let objective = Objective::MeanSquaredError; - let y_true = vec![1.0, 2.0, 3.0]; - let y_pred = vec![1.5, 2.5, 2.5]; - - let gradients = objective.gradient(&y_true, &y_pred); - let hessians = objective.hessian(&y_true, &y_pred); - - // Gradients should be (pred - true) - assert_eq!(gradients, vec![0.5, 0.5, -0.5]); - // Hessians should be all 1.0 for MSE - assert_eq!(hessians, vec![1.0, 1.0, 1.0]); - } - - #[test] - fn test_find_best_split_multidimensional() { - // Data has two features. The second feature is a better predictor. - let data = vec![ - vec![1.0, 10.0], // g = -0.5 - vec![1.0, 20.0], // g = -1.0 - vec![1.0, 30.0], // g = 1.0 - vec![1.0, 40.0], // g = 1.5 - ]; - let data = DenseMatrix::from_2d_vec(&data).unwrap(); - let g = vec![-0.5, -1.0, 1.0, 1.5]; - let h = vec![1.0, 1.0, 1.0, 1.0]; - let idxs = (0..4).collect::>(); - - let mut best_feature_idx = usize::MAX; - let mut best_split_score = 0.0; - let mut best_threshold = 0.0; - - // Manually calculated expected gain for the best split (on feature 1, with lambda=1.0). - // G_left = -1.5, H_left = 2.0 - // G_right = 2.5, H_right = 2.0 - // G_total = 1.0, H_total = 4.0 - // Gain = 0.5 * (G_l^2/(H_l+λ) + G_r^2/(H_r+λ) - G_t^2/(H_t+λ)) - // Gain = 0.5 * ((-1.5)^2/(2+1) + (2.5)^2/(2+1) - (1.0)^2/(4+1)) - // Gain = 0.5 * (2.25/3 + 6.25/3 - 1.0/5) = 0.5 * (0.75 + 2.0833 - 0.2) = 1.3166... - let expected_gain = 1.3166666666666667; - - // Search both features. The algorithm must find the best split on feature 1. - let (_, n_features) = data.shape(); - for i in 0..n_features { - TreeRegressor::, Vec>::find_best_split( - &data, - &g, - &h, - &idxs, - i, - &mut best_feature_idx, - &mut best_split_score, - &mut best_threshold, - 1.0, - 1.0, - 0.0, - ); - } - - assert_eq!(best_feature_idx, 1); // Should choose the second feature - assert!((best_split_score - expected_gain).abs() < 1e-9); - assert_eq!(best_threshold, 25.0); // (20 + 30) / 2 - } - - /// Tests that the TreeRegressor can build a simple one-level tree on multidimensional data. - #[test] - fn test_tree_regressor_fit_multidimensional() { - let data = vec![ - vec![1.0, 10.0], - vec![1.0, 20.0], - vec![1.0, 30.0], - vec![1.0, 40.0], - ]; - let data = DenseMatrix::from_2d_vec(&data).unwrap(); - let g = vec![-0.5, -1.0, 1.0, 1.5]; - let h = vec![1.0, 1.0, 1.0, 1.0]; - let idxs = (0..4).collect::>(); - - let tree = TreeRegressor::, Vec>::fit( - &data, &g, &h, &idxs, 2, 1.0, 1.0, 0.0, - ); - - // Check that the root node was split on the correct feature - assert!(tree.left.is_some()); - assert!(tree.right.is_some()); - assert_eq!(tree.split_feature_idx, 1); // Should split on the second feature - assert_eq!(tree.threshold, 25.0); - - // Check leaf values (G/H+lambda) - // Left leaf: G = -1.5, H = 2.0 => value = -(-1.5)/(2+1) = 0.5 - // Right leaf: G = 2.5, H = 2.0 => value = -(2.5)/(2+1) = -0.8333 - assert!((tree.left.unwrap().value - 0.5).abs() < 1e-9); - assert!((tree.right.unwrap().value - (-0.833333333)).abs() < 1e-9); - } - - /// A "smoke test" to ensure the main XGRegressor can fit and predict on multidimensional data. 
- #[test] - fn test_xgregressor_fit_predict_multidimensional() { - // Simple 2D data where y is roughly 2*x1 + 3*x2 - let x_vec = vec![ - vec![1.0, 1.0], - vec![2.0, 1.0], - vec![1.0, 2.0], - vec![2.0, 2.0], - ]; - let x = DenseMatrix::from_2d_vec(&x_vec).unwrap(); - let y = vec![5.0, 7.0, 8.0, 10.0]; - - let params = XGRegressorParameters::default() - .with_n_estimators(10) - .with_max_depth(2); - - let fit_result = XGRegressor::fit(&x, &y, params); - assert!( - fit_result.is_ok(), - "Fit failed with error: {:?}", - fit_result.err() - ); - - let model = fit_result.unwrap(); - let predict_result = model.predict(&x); - assert!( - predict_result.is_ok(), - "Predict failed with error: {:?}", - predict_result.err() - ); - - let predictions = predict_result.unwrap(); - assert_eq!(predictions.len(), 4); - } -} From 50537e4b3620c49d80794d3b52311f2eabcee69b Mon Sep 17 00:00:00 2001 From: Daniel Lacina Date: Wed, 9 Jul 2025 13:21:20 -0500 Subject: [PATCH 2/4] implemented extra trees --- src/xgboost/mod.rs | 16 + src/xgboost/xgb_regressor.rs | 762 +++++++++++++++++++++++++++++++++++ 2 files changed, 778 insertions(+) create mode 100644 src/xgboost/mod.rs create mode 100644 src/xgboost/xgb_regressor.rs diff --git a/src/xgboost/mod.rs b/src/xgboost/mod.rs new file mode 100644 index 00000000..68e087be --- /dev/null +++ b/src/xgboost/mod.rs @@ -0,0 +1,16 @@ +//! # XGBoost +//! +//! XGBoost, which stands for Extreme Gradient Boosting, is a powerful and efficient implementation of the gradient boosting framework. Gradient boosting is a machine learning technique for regression and classification problems, which produces a prediction model in the form of an ensemble of weak prediction models, typically decision trees. +//! +//! The core idea of boosting is to build the model in a stage-wise fashion. It learns from its mistakes by sequentially adding new models that correct the errors of the previous ones. Unlike bagging, which trains models in parallel, boosting is a sequential process. Each new tree is fit on a modified version of the original data set, specifically focusing on the instances where the previous models performed poorly. +//! +//! XGBoost enhances this process through several key innovations. It employs a more regularized model formalization to control over-fitting, which gives it better performance. It also has a highly optimized and parallelized tree construction process, making it significantly faster and more scalable than traditional gradient boosting implementations. +//! +//! ## References: +//! +//! * "Elements of Statistical Learning", Hastie T., Tibshirani R., Friedman J., 10. Boosting and Additive Trees +//! * XGBoost: A Scalable Tree Boosting System, Chen T., Guestrin C. + +// xgboost implementation +pub mod xgb_regressor; +pub use xgb_regressor::{XGRegressor, XGRegressorParameters}; diff --git a/src/xgboost/xgb_regressor.rs b/src/xgboost/xgb_regressor.rs new file mode 100644 index 00000000..ac6ec752 --- /dev/null +++ b/src/xgboost/xgb_regressor.rs @@ -0,0 +1,762 @@ +//! # Extreme Gradient Boosting (XGBoost) +//! +//! XGBoost is a highly efficient and effective implementation of the gradient boosting framework. +//! Like other boosting models, it builds an ensemble of sequential decision trees, where each new tree +//! is trained to correct the errors of the previous ones. +//! +//! What makes XGBoost powerful is its use of both the first and second derivatives (gradient and hessian) +//! of the loss function, which allows for more accurate approximations and faster convergence. 
It also +//! includes built-in regularization techniques (L1/`alpha` and L2/`lambda`) to prevent overfitting. +//! +//! This implementation was ported to Rust from the concepts and algorithm explained in the blog post +//! ["XGBoost from Scratch"](https://randomrealizations.com/posts/xgboost-from-scratch/). It is designed +//! to be a general-purpose regressor that can be used with any objective function that provides a gradient +//! and a hessian. +//! +//! Example: +//! +//! ``` +//! use smartcore::linalg::basic::matrix::DenseMatrix; +//! use smartcore::xgboost::{XGRegressor, XGRegressorParameters}; +//! +//! // Simple dataset: predict y = 2*x +//! let x = DenseMatrix::from_2d_array(&[ +//! &[1.0], &[2.0], &[3.0], &[4.0], &[5.0] +//! ]).unwrap(); +//! let y = vec![2.0, 4.0, 6.0, 8.0, 10.0]; +//! +//! // Use default parameters, but set a few for demonstration +//! let parameters = XGRegressorParameters::default() +//! .with_n_estimators(50) +//! .with_max_depth(3) +//! .with_learning_rate(0.1); +//! +//! // Train the model +//! let model = XGRegressor::fit(&x, &y, parameters).unwrap(); +//! +//! // Make predictions +//! let x_test = DenseMatrix::from_2d_array(&[&[6.0], &[7.0]]).unwrap(); +//! let y_hat = model.predict(&x_test).unwrap(); +//! +//! // y_hat should be close to [12.0, 14.0] +//! ``` +//! + +use rand::{seq::SliceRandom, Rng}; +use std::{iter::zip, marker::PhantomData}; + +use crate::{ + api::{PredictorBorrow, SupervisedEstimatorBorrow}, + error::{Failed, FailedError}, + linalg::basic::arrays::{Array1, Array2}, + numbers::basenum::Number, + rand_custom::get_rng_impl, +}; + +/// Defines the objective function to be optimized. +/// The objective function provides the loss, gradient (first derivative), and +/// hessian (second derivative) required for the XGBoost algorithm. +#[derive(Clone, Debug)] +pub enum Objective { + /// The objective for regression tasks using Mean Squared Error. + /// Loss: 0.5 * (y_true - y_pred)^2 + MeanSquaredError, +} + +impl Objective { + /// Calculates the loss for each sample given the true and predicted values. + /// + /// # Arguments + /// * `y_true` - A vector of the true target values. + /// * `y_pred` - A vector of the predicted values. + /// + /// # Returns + /// The mean of the calculated loss values. + pub fn loss_function>(&self, y_true: &Y, y_pred: &Vec) -> f64 { + match self { + Objective::MeanSquaredError => { + zip(y_true.iterator(0), y_pred) + .map(|(true_val, pred_val)| { + 0.5 * (true_val.to_f64().unwrap() - pred_val).powi(2) + }) + .sum::() + / y_true.shape() as f64 + } + } + } + + /// Calculates the gradient (first derivative) of the loss function. + /// + /// # Arguments + /// * `y_true` - A vector of the true target values. + /// * `y_pred` - A vector of the predicted values. + /// + /// # Returns + /// A vector of gradients for each sample. + pub fn gradient>(&self, y_true: &Y, y_pred: &Vec) -> Vec { + match self { + Objective::MeanSquaredError => zip(y_true.iterator(0), y_pred) + .map(|(true_val, pred_val)| (*pred_val - true_val.to_f64().unwrap())) + .collect(), + } + } + + /// Calculates the hessian (second derivative) of the loss function. + /// + /// # Arguments + /// * `y_true` - A vector of the true target values. + /// * `y_pred` - A vector of the predicted values. + /// + /// # Returns + /// A vector of hessians for each sample. 
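+    ///
+    /// Note: for `Objective::MeanSquaredError` the per-sample loss is
+    /// 0.5 * (y_true - y_pred)^2, so the gradient is (y_pred - y_true) and the
+    /// second derivative is the constant 1.0 for every sample. This is why the
+    /// implementation below ignores `y_pred` and simply returns a vector of ones.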
+ #[allow(unused_variables)] + pub fn hessian>(&self, y_true: &Y, y_pred: &[f64]) -> Vec { + match self { + Objective::MeanSquaredError => vec![1.0; y_true.shape()], + } + } +} + +/// Represents a single decision tree in the XGBoost ensemble. +/// +/// This is a recursive data structure where each `TreeRegressor` is a node +/// that can have a left and a right child, also of type `TreeRegressor`. +#[allow(dead_code)] +struct TreeRegressor, Y: Array1> { + left: Option>>, + right: Option>>, + /// The output value of this node. If it's a leaf, this is the final prediction. + value: f64, + /// The feature value threshold used to split this node. + threshold: f64, + /// The index of the feature used for splitting. + split_feature_idx: usize, + /// The gain in score achieved by this split. + split_score: f64, + _phantom_tx: PhantomData, + _phantom_ty: PhantomData, + _phantom_x: PhantomData, + _phantom_y: PhantomData, +} + +impl, Y: Array1> + TreeRegressor +{ + /// Recursively builds a decision tree (a `TreeRegressor` node). + /// + /// This function determines the optimal split for the given set of samples (`idxs`) + /// and then recursively calls itself to build the left and right child nodes. + /// + /// # Arguments + /// * `data` - The full training dataset. + /// * `g` - Gradients for all samples. + /// * `h` - Hessians for all samples. + /// * `idxs` - The indices of the samples belonging to the current node. + /// * `max_depth` - The maximum remaining depth for this branch. + /// * `min_child_weight` - The minimum sum of hessians required in a child node. + /// * `lambda` - L2 regularization term on weights. + /// * `gamma` - Minimum loss reduction required to make a further partition. + pub fn fit( + data: &X, + g: &Vec, + h: &Vec, + idxs: &[usize], + max_depth: u16, + min_child_weight: f64, + lambda: f64, + gamma: f64, + ) -> Self { + let g_sum = idxs.iter().map(|&i| g[i]).sum::(); + let h_sum = idxs.iter().map(|&i| h[i]).sum::(); + let value = -g_sum / (h_sum + lambda); + + let mut best_feature_idx = usize::MAX; + let mut best_split_score = 0.0; + let mut best_threshold = 0.0; + let mut left = Option::None; + let mut right = Option::None; + + if max_depth > 0 { + Self::insert_child_nodes( + data, + g, + h, + idxs, + &mut best_feature_idx, + &mut best_split_score, + &mut best_threshold, + &mut left, + &mut right, + max_depth, + min_child_weight, + lambda, + gamma, + ); + } + + Self { + left, + right, + value, + threshold: best_threshold, + split_feature_idx: best_feature_idx, + split_score: best_split_score, + _phantom_tx: PhantomData, + _phantom_ty: PhantomData, + _phantom_x: PhantomData, + _phantom_y: PhantomData, + } + } + + /// Finds the best split and creates child nodes if a valid split is found. + fn insert_child_nodes( + data: &X, + g: &Vec, + h: &Vec, + idxs: &[usize], + best_feature_idx: &mut usize, + best_split_score: &mut f64, + best_threshold: &mut f64, + left: &mut Option>, + right: &mut Option>, + max_depth: u16, + min_child_weight: f64, + lambda: f64, + gamma: f64, + ) { + let (_, n_features) = data.shape(); + for i in 0..n_features { + Self::find_best_split( + data, + g, + h, + idxs, + i, + best_feature_idx, + best_split_score, + best_threshold, + min_child_weight, + lambda, + gamma, + ); + } + + // A split is only valid if it results in a positive gain. 
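+        // For reference (see `find_best_split`): with gradient sum G and hessian sum H
+        // on each side of a candidate threshold, the gain of a split is
+        //   0.5 * (G_L^2 / (H_L + lambda) + G_R^2 / (H_R + lambda) - G^2 / (H + lambda)) - gamma
+        // while the leaf value computed in `fit` is -G / (H + lambda). A best score of
+        // 0.0 therefore means no candidate improved on keeping this node a leaf.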
+ if *best_split_score > 0.0 { + let mut left_idxs = Vec::new(); + let mut right_idxs = Vec::new(); + for idx in idxs.iter() { + if data.get((*idx, *best_feature_idx)).to_f64().unwrap() <= *best_threshold { + left_idxs.push(*idx); + } else { + right_idxs.push(*idx); + } + } + + *left = Some(Box::new(TreeRegressor::fit( + data, + g, + h, + &left_idxs, + max_depth - 1, + min_child_weight, + lambda, + gamma, + ))); + *right = Some(Box::new(TreeRegressor::fit( + data, + g, + h, + &right_idxs, + max_depth - 1, + min_child_weight, + lambda, + gamma, + ))); + } + } + + /// Iterates through a single feature to find the best possible split point. + fn find_best_split( + data: &X, + g: &[f64], + h: &[f64], + idxs: &[usize], + feature_idx: usize, + best_feature_idx: &mut usize, + best_split_score: &mut f64, + best_threshold: &mut f64, + min_child_weight: f64, + lambda: f64, + gamma: f64, + ) { + let mut sorted_idxs = idxs.to_owned(); + sorted_idxs.sort_by(|a, b| { + data.get((*a, feature_idx)) + .partial_cmp(data.get((*b, feature_idx))) + .unwrap() + }); + + let sum_g = sorted_idxs.iter().map(|&i| g[i]).sum::(); + let sum_h = sorted_idxs.iter().map(|&i| h[i]).sum::(); + + let mut sum_g_right = sum_g; + let mut sum_h_right = sum_h; + let mut sum_g_left = 0.0; + let mut sum_h_left = 0.0; + + for i in 0..sorted_idxs.len() - 1 { + let idx = sorted_idxs[i]; + let next_idx = sorted_idxs[i + 1]; + + let g_i = g[idx]; + let h_i = h[idx]; + let x_i = data.get((idx, feature_idx)).to_f64().unwrap(); + let x_i_next = data.get((next_idx, feature_idx)).to_f64().unwrap(); + + sum_g_left += g_i; + sum_h_left += h_i; + sum_g_right -= g_i; + sum_h_right -= h_i; + + if sum_h_left < min_child_weight || x_i == x_i_next { + continue; + } + if sum_h_right < min_child_weight { + break; + } + + let gain = 0.5 + * ((sum_g_left * sum_g_left / (sum_h_left + lambda)) + + (sum_g_right * sum_g_right / (sum_h_right + lambda)) + - (sum_g * sum_g / (sum_h + lambda))) + - gamma; + + if gain > *best_split_score { + *best_split_score = gain; + *best_threshold = (x_i + x_i_next) / 2.0; + *best_feature_idx = feature_idx; + } + } + } + + /// Predicts the output values for a dataset. + pub fn predict(&self, data: &X) -> Vec { + let (n_samples, n_features) = data.shape(); + (0..n_samples) + .map(|i| { + self.predict_for_row(&Vec::from_iterator( + data.get_row(i).iterator(0).copied(), + n_features, + )) + }) + .collect() + } + + /// Predicts the output value for a single row of data by traversing the tree. + pub fn predict_for_row(&self, row: &Vec) -> f64 { + // A leaf node is identified by having no children. + if self.left.is_none() { + return self.value; + } + + // Recurse down the appropriate branch. + let child = if row[self.split_feature_idx].to_f64().unwrap() <= self.threshold { + self.left.as_ref().unwrap() + } else { + self.right.as_ref().unwrap() + }; + + child.predict_for_row(row) + } +} + +/// Parameters for the `jRegressor` model. +/// +/// This struct holds all the hyperparameters that control the training process. +#[derive(Clone, Debug)] +pub struct XGRegressorParameters { + /// The number of boosting rounds or trees to build. + pub n_estimators: usize, + /// The maximum depth of each tree. + pub max_depth: u16, + /// Step size shrinkage used to prevent overfitting. + pub learning_rate: f64, + /// Minimum sum of instance weight (hessian) needed in a child. + pub min_child_weight: usize, + /// L2 regularization term on weights. + pub lambda: f64, + /// Minimum loss reduction required to make a further partition on a leaf node. 
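+    ///
+    /// `gamma` is subtracted from every candidate split gain, so larger values prune
+    /// more aggressively (the reference XGBoost library calls this `min_split_loss`).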
+ pub gamma: f64, + /// The initial prediction score for all instances. + pub base_score: f64, + /// The fraction of samples to be used for fitting the individual base learners. + pub subsample: f64, + /// The seed for the random number generator for reproducibility. + pub seed: u64, + /// The objective function to be optimized. + pub objective: Objective, +} + +impl Default for XGRegressorParameters { + /// Creates a new set of `XGRegressorParameters` with default values. + fn default() -> Self { + Self { + n_estimators: 100, + learning_rate: 0.3, + max_depth: 6, + min_child_weight: 1, + lambda: 1.0, + gamma: 0.0, + base_score: 0.5, + subsample: 1.0, + seed: 0, + objective: Objective::MeanSquaredError, + } + } +} + +// Builder pattern for XGRegressorParameters +impl XGRegressorParameters { + /// Sets the number of boosting rounds or trees to build. + pub fn with_n_estimators(mut self, n_estimators: usize) -> Self { + self.n_estimators = n_estimators; + self + } + + /// Sets the step size shrinkage used to prevent overfitting. + /// + /// Also known as `eta`. A smaller value makes the model more robust by preventing + /// too much weight being given to any single tree. + pub fn with_learning_rate(mut self, learning_rate: f64) -> Self { + self.learning_rate = learning_rate; + self + } + + /// Sets the maximum depth of each individual tree. + // A lower value helps prevent overfitting.* + pub fn with_max_depth(mut self, max_depth: u16) -> Self { + self.max_depth = max_depth; + self + } + + /// Sets the minimum sum of instance weight (hessian) needed in a child node. + /// + /// If the tree partition step results in a leaf node with the sum of + // instance weight less than `min_child_weight`, then the building process* + /// will give up further partitioning. + pub fn with_min_child_weight(mut self, min_child_weight: usize) -> Self { + self.min_child_weight = min_child_weight; + self + } + + /// Sets the L2 regularization term on weights (`lambda`). + /// + /// Increasing this value will make the model more conservative. + pub fn with_lambda(mut self, lambda: f64) -> Self { + self.lambda = lambda; + self + } + + /// Sets the minimum loss reduction required to make a further partition on a leaf node. + /// + /// The larger `gamma` is, the more conservative the algorithm will be. + pub fn with_gamma(mut self, gamma: f64) -> Self { + self.gamma = gamma; + self + } + + /// Sets the initial prediction score for all instances. + pub fn with_base_score(mut self, base_score: f64) -> Self { + self.base_score = base_score; + self + } + + /// Sets the fraction of samples to be used for fitting individual base learners. + /// + /// A value of less than 1.0 introduces randomness and helps prevent overfitting. + pub fn with_subsample(mut self, subsample: f64) -> Self { + self.subsample = subsample; + self + } + + /// Sets the seed for the random number generator for reproducibility. + pub fn with_seed(mut self, seed: u64) -> Self { + self.seed = seed; + self + } + + /// Sets the objective function to be optimized during training. + pub fn with_objective(mut self, objective: Objective) -> Self { + self.objective = objective; + self + } +} + +/// An Extreme Gradient Boosting (XGBoost) model for regression and classification tasks. +pub struct XGRegressor, Y: Array1> { + regressors: Option>>, + parameters: Option, + _phantom_ty: PhantomData, + _phantom_tx: PhantomData, + _phantom_y: PhantomData, + _phantom_x: PhantomData, +} + +impl, Y: Array1> XGRegressor { + /// Fits the XGBoost model to the training data. 
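+    ///
+    /// Training is stage-wise: predictions start at `base_score`, and each of the
+    /// `n_estimators` rounds fits a `TreeRegressor` to the current gradients and
+    /// hessians (optionally on a random subsample of rows), then updates every
+    /// prediction by `learning_rate` times the tree's output.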
+ pub fn fit(data: &X, y: &Y, parameters: XGRegressorParameters) -> Result { + if parameters.subsample > 1.0 || parameters.subsample <= 0.0 { + return Err(Failed::because( + FailedError::ParametersError, + "Subsample ratio must be in (0, 1].", + )); + } + + let (n_samples, _) = data.shape(); + let learning_rate = parameters.learning_rate; + let mut predictions = vec![parameters.base_score; n_samples]; + + let mut regressors = Vec::new(); + let mut rng = get_rng_impl(Some(parameters.seed)); + + for _ in 0..parameters.n_estimators { + let gradients = parameters.objective.gradient(y, &predictions); + let hessians = parameters.objective.hessian(y, &predictions); + + let sample_idxs = if parameters.subsample < 1.0 { + Self::sample_without_replacement(n_samples, parameters.subsample, &mut rng) + } else { + (0..n_samples).collect::>() + }; + + let regressor = TreeRegressor::fit( + data, + &gradients, + &hessians, + &sample_idxs, + parameters.max_depth, + parameters.min_child_weight as f64, + parameters.lambda, + parameters.gamma, + ); + + let corrections = regressor.predict(data); + predictions = zip(predictions, corrections) + .map(|(pred, correction)| pred + (learning_rate * correction)) + .collect(); + + regressors.push(regressor); + } + + Ok(Self { + regressors: Some(regressors), + parameters: Some(parameters), + _phantom_ty: PhantomData, + _phantom_y: PhantomData, + _phantom_tx: PhantomData, + _phantom_x: PhantomData, + }) + } + + /// Predicts target values for the given input data. + pub fn predict(&self, data: &X) -> Result, Failed> { + let (n_samples, _) = data.shape(); + + let parameters = self.parameters.as_ref().unwrap(); + let mut predictions = vec![parameters.base_score; n_samples]; + let regressors = self.regressors.as_ref().unwrap(); + + for regressor in regressors.iter() { + let corrections = regressor.predict(data); + predictions = zip(predictions, corrections) + .map(|(pred, correction)| pred + (parameters.learning_rate * correction)) + .collect(); + } + + Ok(predictions + .into_iter() + .map(|p| TX::from_f64(p).unwrap()) + .collect()) + } + + /// Creates a random sample of indices without replacement. + fn sample_without_replacement( + population_size: usize, + subsample_ratio: f64, + rng: &mut impl Rng, + ) -> Vec { + let mut indices: Vec = (0..population_size).collect(); + indices.shuffle(rng); + indices.truncate((population_size as f64 * subsample_ratio) as usize); + indices + } +} + +// Boilerplate implementation for the smartcore traits +impl, Y: Array1> + SupervisedEstimatorBorrow<'_, X, Y, XGRegressorParameters> for XGRegressor +{ + fn new() -> Self { + Self { + regressors: None, + parameters: None, + _phantom_ty: PhantomData, + _phantom_y: PhantomData, + _phantom_tx: PhantomData, + _phantom_x: PhantomData, + } + } + + fn fit(x: &X, y: &Y, parameters: &XGRegressorParameters) -> Result { + XGRegressor::fit(x, y, parameters.clone()) + } +} + +impl, Y: Array1> PredictorBorrow<'_, X, TX> + for XGRegressor +{ + fn predict(&self, x: &X) -> Result, Failed> { + self.predict(x) + } +} + +// ------------------- TESTS ------------------- +#[cfg(test)] +mod tests { + use super::*; + use crate::linalg::basic::{arrays::Array, matrix::DenseMatrix}; + + /// Tests the gradient and hessian calculations for MeanSquaredError. 
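+    ///
+    /// Worked by hand: with y_true = [1.0, 2.0, 3.0] and y_pred = [1.5, 2.5, 2.5],
+    /// the MSE gradient (y_pred - y_true) is [0.5, 0.5, -0.5] and the hessian is
+    /// 1.0 for every sample.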
+    #[test]
+    fn test_mse_objective() {
+        let objective = Objective::MeanSquaredError;
+        let y_true = vec![1.0, 2.0, 3.0];
+        let y_pred = vec![1.5, 2.5, 2.5];
+
+        let gradients = objective.gradient(&y_true, &y_pred);
+        let hessians = objective.hessian(&y_true, &y_pred);
+
+        // Gradients should be (pred - true)
+        assert_eq!(gradients, vec![0.5, 0.5, -0.5]);
+        // Hessians should be all 1.0 for MSE
+        assert_eq!(hessians, vec![1.0, 1.0, 1.0]);
+    }
+
+    #[test]
+    fn test_find_best_split_multidimensional() {
+        // Data has two features. The second feature is a better predictor.
+        let data = vec![
+            vec![1.0, 10.0], // g = -0.5
+            vec![1.0, 20.0], // g = -1.0
+            vec![1.0, 30.0], // g = 1.0
+            vec![1.0, 40.0], // g = 1.5
+        ];
+        let data = DenseMatrix::from_2d_vec(&data).unwrap();
+        let g = vec![-0.5, -1.0, 1.0, 1.5];
+        let h = vec![1.0, 1.0, 1.0, 1.0];
+        let idxs = (0..4).collect::<Vec<usize>>();
+
+        let mut best_feature_idx = usize::MAX;
+        let mut best_split_score = 0.0;
+        let mut best_threshold = 0.0;
+
+        // Manually calculated expected gain for the best split (on feature 1, with lambda=1.0).
+        // G_left = -1.5, H_left = 2.0
+        // G_right = 2.5, H_right = 2.0
+        // G_total = 1.0, H_total = 4.0
+        // Gain = 0.5 * (G_l^2/(H_l+λ) + G_r^2/(H_r+λ) - G_t^2/(H_t+λ))
+        // Gain = 0.5 * ((-1.5)^2/(2+1) + (2.5)^2/(2+1) - (1.0)^2/(4+1))
+        // Gain = 0.5 * (2.25/3 + 6.25/3 - 1.0/5) = 0.5 * (0.75 + 2.0833 - 0.2) = 1.3166...
+        let expected_gain = 1.3166666666666667;
+
+        // Search both features. The algorithm must find the best split on feature 1.
+        let (_, n_features) = data.shape();
+        for i in 0..n_features {
+            TreeRegressor::<f64, f64, DenseMatrix<f64>, Vec<f64>>::find_best_split(
+                &data,
+                &g,
+                &h,
+                &idxs,
+                i,
+                &mut best_feature_idx,
+                &mut best_split_score,
+                &mut best_threshold,
+                1.0,
+                1.0,
+                0.0,
+            );
+        }
+
+        assert_eq!(best_feature_idx, 1); // Should choose the second feature
+        assert!((best_split_score - expected_gain).abs() < 1e-9);
+        assert_eq!(best_threshold, 25.0); // (20 + 30) / 2
+    }
+
+    /// Tests that the TreeRegressor can build a simple one-level tree on multidimensional data.
+    #[test]
+    fn test_tree_regressor_fit_multidimensional() {
+        let data = vec![
+            vec![1.0, 10.0],
+            vec![1.0, 20.0],
+            vec![1.0, 30.0],
+            vec![1.0, 40.0],
+        ];
+        let data = DenseMatrix::from_2d_vec(&data).unwrap();
+        let g = vec![-0.5, -1.0, 1.0, 1.5];
+        let h = vec![1.0, 1.0, 1.0, 1.0];
+        let idxs = (0..4).collect::<Vec<usize>>();
+
+        let tree = TreeRegressor::<f64, f64, DenseMatrix<f64>, Vec<f64>>::fit(
+            &data, &g, &h, &idxs, 2, 1.0, 1.0, 0.0,
+        );
+
+        // Check that the root node was split on the correct feature
+        assert!(tree.left.is_some());
+        assert!(tree.right.is_some());
+        assert_eq!(tree.split_feature_idx, 1); // Should split on the second feature
+        assert_eq!(tree.threshold, 25.0);
+
+        // Check leaf values (value = -G / (H + lambda))
+        // Left leaf: G = -1.5, H = 2.0 => value = -(-1.5)/(2+1) = 0.5
+        // Right leaf: G = 2.5, H = 2.0 => value = -(2.5)/(2+1) = -0.8333
+        assert!((tree.left.unwrap().value - 0.5).abs() < 1e-9);
+        assert!((tree.right.unwrap().value - (-0.833333333)).abs() < 1e-9);
+    }
+
+    /// A "smoke test" to ensure the main XGRegressor can fit and predict on multidimensional data.
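+    ///
+    /// It only asserts that `fit` and `predict` succeed and that one prediction is
+    /// returned per input row; it does not check the predicted values themselves.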
+ #[test] + fn test_xgregressor_fit_predict_multidimensional() { + // Simple 2D data where y is roughly 2*x1 + 3*x2 + let x_vec = vec![ + vec![1.0, 1.0], + vec![2.0, 1.0], + vec![1.0, 2.0], + vec![2.0, 2.0], + ]; + let x = DenseMatrix::from_2d_vec(&x_vec).unwrap(); + let y = vec![5.0, 7.0, 8.0, 10.0]; + + let params = XGRegressorParameters::default() + .with_n_estimators(10) + .with_max_depth(2); + + let fit_result = XGRegressor::fit(&x, &y, params); + assert!( + fit_result.is_ok(), + "Fit failed with error: {:?}", + fit_result.err() + ); + + let model = fit_result.unwrap(); + let predict_result = model.predict(&x); + assert!( + predict_result.is_ok(), + "Predict failed with error: {:?}", + predict_result.err() + ); + + let predictions = predict_result.unwrap(); + assert_eq!(predictions.len(), 4); + } +} From 35bf52b5e4012ceeea3c5fcdfdf2667b32aa6a4c Mon Sep 17 00:00:00 2001 From: Daniel Lacina Date: Wed, 9 Jul 2025 13:23:13 -0500 Subject: [PATCH 3/4] fixed unecessary changes --- Cargo.toml | 2 -- 1 file changed, 2 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index e48b5d0c..3c1b8ab9 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -28,8 +28,6 @@ num = "0.4" rand = { version = "0.8.5", default-features = false, features = ["small_rng"] } rand_distr = { version = "0.4", optional = true } serde = { version = "1", features = ["derive"], optional = true } -keyed_priority_queue = "0.4.2" -ordered-float = "5.0.0" [target.'cfg(not(target_arch = "wasm32"))'.dependencies] typetag = { version = "0.2", optional = true } From 50ac3976b39d993e6035bba15bedefba03407e2c Mon Sep 17 00:00:00 2001 From: Daniel Lacina Date: Wed, 9 Jul 2025 13:24:35 -0500 Subject: [PATCH 4/4] fixed unecessary changes --- src/tree/decision_tree_regressor.rs | 1 - 1 file changed, 1 deletion(-) diff --git a/src/tree/decision_tree_regressor.rs b/src/tree/decision_tree_regressor.rs index af4e2748..0f574402 100644 --- a/src/tree/decision_tree_regressor.rs +++ b/src/tree/decision_tree_regressor.rs @@ -584,7 +584,6 @@ impl, Y: Array1> use_random_splits: bool, ) { if use_random_splits { - // --- EFFICIENT LOGIC for Continuous Random Splits --- let (min_val, max_val) = { let mut min_opt = None; let mut max_opt = None;