From 4b2d53ba38430280185860abccf4a9ef0d486fc9 Mon Sep 17 00:00:00 2001 From: Daniel Lacina Date: Wed, 9 Jul 2025 13:20:37 -0500 Subject: [PATCH 1/4] implemented extra trees --- Cargo.toml | 2 + src/ensemble/extra_trees_regressor.rs | 320 ++++++++++ src/ensemble/forest_regressor.rs | 222 +++++++ src/ensemble/mod.rs | 2 + src/ensemble/random_forest_regressor.rs | 151 +---- src/lib.rs | 1 - src/tree/decision_tree_regressor.rs | 178 ++++-- src/xgboost/mod.rs | 16 - src/xgboost/xgb_regressor.rs | 762 ------------------------ 9 files changed, 700 insertions(+), 954 deletions(-) create mode 100644 src/ensemble/extra_trees_regressor.rs create mode 100644 src/ensemble/forest_regressor.rs delete mode 100644 src/xgboost/mod.rs delete mode 100644 src/xgboost/xgb_regressor.rs diff --git a/Cargo.toml b/Cargo.toml index 3c1b8ab9..e48b5d0c 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -28,6 +28,8 @@ num = "0.4" rand = { version = "0.8.5", default-features = false, features = ["small_rng"] } rand_distr = { version = "0.4", optional = true } serde = { version = "1", features = ["derive"], optional = true } +keyed_priority_queue = "0.4.2" +ordered-float = "5.0.0" [target.'cfg(not(target_arch = "wasm32"))'.dependencies] typetag = { version = "0.2", optional = true } diff --git a/src/ensemble/extra_trees_regressor.rs b/src/ensemble/extra_trees_regressor.rs new file mode 100644 index 00000000..88ea1ce8 --- /dev/null +++ b/src/ensemble/extra_trees_regressor.rs @@ -0,0 +1,320 @@ +//! # Extra Trees Regressor +//! An Extra-Trees (Extremely Randomized Trees) regressor is an ensemble learning method that fits multiple randomized +//! decision trees on the dataset and averages their predictions to improve accuracy and control over-fitting. +//! +//! It is similar to a standard Random Forest, but introduces more randomness in the way splits are chosen, which can +//! reduce the variance of the model and often make the training process faster. +//! +//! The two key differences from a standard Random Forest are: +//! 1. It uses the whole original dataset to build each tree instead of bootstrap samples. +//! 2. When splitting a node, it chooses a random split point for each feature, rather than the most optimal one. +//! +//! See [ensemble models](../index.html) for more details. +//! +//! Bigger number of estimators in general improves performance of the algorithm with an increased cost of training time. +//! The random sample of _m_ predictors is typically set to be \\(\sqrt{p}\\) from the full set of _p_ predictors. +//! +//! Example: +//! +//! ``` +//! use smartcore::linalg::basic::matrix::DenseMatrix; +//! use smartcore::ensemble::extra_trees_regressor::*; +//! +//! // Longley dataset ([https://www.statsmodels.org/stable/datasets/generated/longley.html](https://www.statsmodels.org/stable/datasets/generated/longley.html)) +//! let x = DenseMatrix::from_2d_array(&[ +//! &[234.289, 235.6, 159., 107.608, 1947., 60.323], +//! &[259.426, 232.5, 145.6, 108.632, 1948., 61.122], +//! &[258.054, 368.2, 161.6, 109.773, 1949., 60.171], +//! &[284.599, 335.1, 165., 110.929, 1950., 61.187], +//! &[328.975, 209.9, 309.9, 112.075, 1951., 63.221], +//! &[346.999, 193.2, 359.4, 113.27, 1952., 63.639], +//! &[365.385, 187., 354.7, 115.094, 1953., 64.989], +//! &[363.112, 357.8, 335., 116.219, 1954., 63.761], +//! &[397.469, 290.4, 304.8, 117.388, 1955., 66.019], +//! &[419.18, 282.2, 285.7, 118.734, 1956., 67.857], +//! &[442.769, 293.6, 279.8, 120.445, 1957., 68.169], +//! &[444.546, 468.1, 263.7, 121.95, 1958., 66.513], +//! 
&[482.704, 381.3, 255.2, 123.366, 1959., 68.655], +//! &[502.601, 393.1, 251.4, 125.368, 1960., 69.564], +//! &[518.173, 480.6, 257.2, 127.852, 1961., 69.331], +//! &[554.894, 400.7, 282.7, 130.081, 1962., 70.551], +//! ]).unwrap(); +//! let y = vec![ +//! 83.0, 88.5, 88.2, 89.5, 96.2, 98.1, 99.0, 100.0, 101.2, +//! 104.6, 108.4, 110.8, 112.6, 114.2, 115.7, 116.9 +//! ]; +//! +//! let regressor = ExtraTreesRegressor::fit(&x, &y, Default::default()).unwrap(); +//! +//! let y_hat = regressor.predict(&x).unwrap(); // use the same data for prediction +//! ``` +//! +//! +//! + +use std::default::Default; +use std::fmt::Debug; + +#[cfg(feature = "serde")] +use serde::{Deserialize, Serialize}; + +use crate::api::{Predictor, SupervisedEstimator}; +use crate::ensemble::forest_regressor::ForestRegressorParameters; +use crate::error::Failed; +use crate::linalg::basic::arrays::{Array1, Array2}; +use crate::numbers::basenum::Number; +use crate::numbers::floatnum::FloatNumber; + +use super::forest_regressor::ForestRegressor; + +#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))] +#[derive(Debug, Clone)] +/// Parameters of the Extra Trees Regressor +/// Some parameters here are passed directly into base estimator. +pub struct ExtraTreesRegressorParameters { + #[cfg_attr(feature = "serde", serde(default))] + /// Tree max depth. See [Decision Tree Regressor](../../tree/decision_tree_regressor/index.html) + pub max_depth: Option, + #[cfg_attr(feature = "serde", serde(default))] + /// The minimum number of samples required to be at a leaf node. See [Decision Tree Regressor](../../tree/decision_tree_regressor/index.html) + pub min_samples_leaf: usize, + #[cfg_attr(feature = "serde", serde(default))] + /// The minimum number of samples required to split an internal node. See [Decision Tree Regressor](../../tree/decision_tree_regressor/index.html) + pub min_samples_split: usize, + #[cfg_attr(feature = "serde", serde(default))] + /// The number of trees in the forest. + pub n_trees: usize, + #[cfg_attr(feature = "serde", serde(default))] + /// Number of random sample of predictors to use as split candidates. + pub m: Option, + #[cfg_attr(feature = "serde", serde(default))] + /// Whether to keep samples used for tree generation. This is required for OOB prediction. + pub keep_samples: bool, + #[cfg_attr(feature = "serde", serde(default))] + /// Seed used for bootstrap sampling and feature selection for each tree. + pub seed: u64, +} + +/// Extra Trees Regressor +#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))] +#[derive(Debug)] +pub struct ExtraTreesRegressor< + TX: Number + FloatNumber + PartialOrd, + TY: Number, + X: Array2, + Y: Array1, +> { + forest_regressor: Option>, +} + +impl ExtraTreesRegressorParameters { + /// Tree max depth. See [Decision Tree Classifier](../../tree/decision_tree_classifier/index.html) + pub fn with_max_depth(mut self, max_depth: u16) -> Self { + self.max_depth = Some(max_depth); + self + } + /// The minimum number of samples required to be at a leaf node. See [Decision Tree Classifier](../../tree/decision_tree_classifier/index.html) + pub fn with_min_samples_leaf(mut self, min_samples_leaf: usize) -> Self { + self.min_samples_leaf = min_samples_leaf; + self + } + /// The minimum number of samples required to split an internal node. 
See [Decision Tree Classifier](../../tree/decision_tree_classifier/index.html) + pub fn with_min_samples_split(mut self, min_samples_split: usize) -> Self { + self.min_samples_split = min_samples_split; + self + } + /// The number of trees in the forest. + pub fn with_n_trees(mut self, n_trees: usize) -> Self { + self.n_trees = n_trees; + self + } + /// Number of random sample of predictors to use as split candidates. + pub fn with_m(mut self, m: usize) -> Self { + self.m = Some(m); + self + } + + /// Whether to keep samples used for tree generation. This is required for OOB prediction. + pub fn with_keep_samples(mut self, keep_samples: bool) -> Self { + self.keep_samples = keep_samples; + self + } + + /// Seed used for bootstrap sampling and feature selection for each tree. + pub fn with_seed(mut self, seed: u64) -> Self { + self.seed = seed; + self + } +} +impl Default for ExtraTreesRegressorParameters { + fn default() -> Self { + ExtraTreesRegressorParameters { + max_depth: Option::None, + min_samples_leaf: 1, + min_samples_split: 2, + n_trees: 10, + m: Option::None, + keep_samples: false, + seed: 0, + } + } +} + +impl, Y: Array1> + SupervisedEstimator for ExtraTreesRegressor +{ + fn new() -> Self { + Self { + forest_regressor: Option::None, + } + } + + fn fit(x: &X, y: &Y, parameters: ExtraTreesRegressorParameters) -> Result { + ExtraTreesRegressor::fit(x, y, parameters) + } +} + +impl, Y: Array1> + Predictor for ExtraTreesRegressor +{ + fn predict(&self, x: &X) -> Result { + self.predict(x) + } +} + +impl, Y: Array1> + ExtraTreesRegressor +{ + /// Build a forest of trees from the training set. + /// * `x` - _NxM_ matrix with _N_ observations and _M_ features in each observation. + /// * `y` - the target class values + pub fn fit( + x: &X, + y: &Y, + parameters: ExtraTreesRegressorParameters, + ) -> Result, Failed> { + let regressor_params = ForestRegressorParameters { + max_depth: parameters.max_depth, + min_samples_leaf: parameters.min_samples_leaf, + min_samples_split: parameters.min_samples_split, + n_trees: parameters.n_trees, + m: parameters.m, + keep_samples: parameters.keep_samples, + seed: parameters.seed, + bootstrap: false, + use_random_splits: true, + }; + let forest_regressor = ForestRegressor::fit(x, y, regressor_params)?; + + Ok(ExtraTreesRegressor { + forest_regressor: Some(forest_regressor), + }) + } + + /// Predict class for `x` + /// * `x` - _KxM_ data where _K_ is number of observations and _M_ is number of features. + pub fn predict(&self, x: &X) -> Result { + let forest_regressor = self.forest_regressor.as_ref().unwrap(); + forest_regressor.predict(x) + } + + /// Predict OOB classes for `x`. `x` is expected to be equal to the dataset used in training. + pub fn predict_oob(&self, x: &X) -> Result { + let forest_regressor = self.forest_regressor.as_ref().unwrap(); + forest_regressor.predict_oob(x) + } +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::linalg::basic::matrix::DenseMatrix; + use crate::metrics::mean_squared_error; + + #[test] + fn test_extra_trees_regressor_fit_predict() { + // Use a simpler, more predictable dataset for unit testing. 
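+        // The target below is a monotone function of the two features, so even with
+        // fully random split points the averaged trees are expected to track y closely.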
+ let x = DenseMatrix::from_2d_array(&[ + &[1., 2.], + &[3., 4.], + &[5., 6.], + &[7., 8.], + &[9., 10.], + &[11., 12.], + &[13., 14.], + &[15., 16.], + ]) + .unwrap(); + let y = vec![1., 2., 3., 4., 5., 6., 7., 8.]; + + let parameters = ExtraTreesRegressorParameters::default() + .with_n_trees(100) + .with_seed(42); + + let regressor = ExtraTreesRegressor::fit(&x, &y, parameters).unwrap(); + let y_hat = regressor.predict(&x).unwrap(); + + assert_eq!(y_hat.len(), y.len()); + // A basic check to ensure the model is learning something. + // The error should be significantly less than the variance of y. + let mse = mean_squared_error(&y, &y_hat); + println!("{}", mse); + // With this simple dataset, the error should be very low. + assert!(mse < 1.0); + } + + #[test] + fn test_fit_predict_higher_dims() { + // Dataset with 10 features, but y is only dependent on the 3rd feature (index 2). + let x = DenseMatrix::from_2d_array(&[ + // The 3rd column is the important one. The rest are noise. + &[0., 0., 10., 5., 8., 1., 4., 9., 2., 7.], + &[0., 0., 20., 1., 2., 3., 4., 5., 6., 7.], + &[0., 0., 30., 7., 6., 5., 4., 3., 2., 1.], + &[0., 0., 40., 9., 2., 4., 6., 8., 1., 3.], + &[0., 0., 55., 3., 1., 8., 6., 4., 2., 9.], + &[0., 0., 65., 2., 4., 7., 5., 3., 1., 8.], + ]) + .unwrap(); + let y = vec![10., 20., 30., 40., 55., 65.]; + + let parameters = ExtraTreesRegressorParameters::default() + .with_n_trees(100) + .with_seed(42); + + let regressor = ExtraTreesRegressor::fit(&x, &y, parameters).unwrap(); + let y_hat = regressor.predict(&x).unwrap(); + + assert_eq!(y_hat.len(), y.len()); + + let mse = mean_squared_error(&y, &y_hat); + + // The model should be able to learn this simple relationship perfectly, + // ignoring the noise features. The MSE should be very low. + assert!(mse < 1.0); + } + + #[test] + fn test_reproducibility() { + let x = DenseMatrix::from_2d_array(&[ + &[1., 2.], + &[3., 4.], + &[5., 6.], + &[7., 8.], + &[9., 10.], + &[11., 12.], + ]) + .unwrap(); + let y = vec![1.0, 2.0, 3.0, 4.0, 5.0, 6.0]; + + let params = ExtraTreesRegressorParameters::default().with_seed(42); + + let regressor1 = ExtraTreesRegressor::fit(&x, &y, params.clone()).unwrap(); + let y_hat1 = regressor1.predict(&x).unwrap(); + + let regressor2 = ExtraTreesRegressor::fit(&x, &y, params.clone()).unwrap(); + let y_hat2 = regressor2.predict(&x).unwrap(); + + assert_eq!(y_hat1, y_hat2); + } +} diff --git a/src/ensemble/forest_regressor.rs b/src/ensemble/forest_regressor.rs new file mode 100644 index 00000000..88d07bd7 --- /dev/null +++ b/src/ensemble/forest_regressor.rs @@ -0,0 +1,222 @@ +use rand::Rng; +use std::fmt::Debug; + +#[cfg(feature = "serde")] +use serde::{Deserialize, Serialize}; + +use crate::error::{Failed, FailedError}; +use crate::linalg::basic::arrays::{Array1, Array2}; +use crate::numbers::basenum::Number; +use crate::numbers::floatnum::FloatNumber; + +use crate::rand_custom::get_rng_impl; +use crate::tree::decision_tree_regressor::{ + DecisionTreeRegressor, DecisionTreeRegressorParameters, +}; + +#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))] +#[derive(Debug, Clone)] +/// Parameters of the Forest Regressor +/// Some parameters here are passed directly into base estimator. +pub struct ForestRegressorParameters { + #[cfg_attr(feature = "serde", serde(default))] + /// Tree max depth. 
See [Decision Tree Regressor](../../tree/decision_tree_regressor/index.html) + pub max_depth: Option, + #[cfg_attr(feature = "serde", serde(default))] + /// The minimum number of samples required to be at a leaf node. See [Decision Tree Regressor](../../tree/decision_tree_regressor/index.html) + pub min_samples_leaf: usize, + #[cfg_attr(feature = "serde", serde(default))] + /// The minimum number of samples required to split an internal node. See [Decision Tree Regressor](../../tree/decision_tree_regressor/index.html) + pub min_samples_split: usize, + #[cfg_attr(feature = "serde", serde(default))] + /// The number of trees in the forest. + pub n_trees: usize, + #[cfg_attr(feature = "serde", serde(default))] + /// Number of random sample of predictors to use as split candidates. + pub m: Option, + #[cfg_attr(feature = "serde", serde(default))] + /// Whether to keep samples used for tree generation. This is required for OOB prediction. + pub keep_samples: bool, + #[cfg_attr(feature = "serde", serde(default))] + /// Seed used for bootstrap sampling and feature selection for each tree. + pub seed: u64, + #[cfg_attr(feature = "serde", serde(default))] + pub bootstrap: bool, + #[cfg_attr(feature = "serde", serde(default))] + pub use_random_splits: bool, +} + +impl, Y: Array1> PartialEq + for ForestRegressor +{ + fn eq(&self, other: &Self) -> bool { + if self.trees.as_ref().unwrap().len() != other.trees.as_ref().unwrap().len() { + false + } else { + self.trees + .iter() + .zip(other.trees.iter()) + .all(|(a, b)| a == b) + } + } +} + +/// Forest Regressor +#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))] +#[derive(Debug)] +pub struct ForestRegressor< + TX: Number + FloatNumber + PartialOrd, + TY: Number, + X: Array2, + Y: Array1, +> { + trees: Option>>, + samples: Option>>, +} + +impl, Y: Array1> + ForestRegressor +{ + /// Build a forest of trees from the training set. + /// * `x` - _NxM_ matrix with _N_ observations and _M_ features in each observation. 
+ /// * `y` - the target class values + pub fn fit( + x: &X, + y: &Y, + parameters: ForestRegressorParameters, + ) -> Result, Failed> { + let (n_rows, num_attributes) = x.shape(); + + if n_rows != y.shape() { + return Err(Failed::fit("Number of rows in X should = len(y)")); + } + + let mtry = parameters + .m + .unwrap_or((num_attributes as f64).sqrt().floor() as usize); + + let mut rng = get_rng_impl(Some(parameters.seed)); + let mut trees: Vec> = Vec::new(); + + let mut maybe_all_samples: Option>> = Option::None; + if parameters.keep_samples { + // TODO: use with_capacity here + maybe_all_samples = Some(Vec::new()); + } + + let mut samples: Vec = (0..n_rows).map(|_| 1).collect(); + + for _ in 0..parameters.n_trees { + if parameters.bootstrap { + samples = + ForestRegressor::::sample_with_replacement(n_rows, &mut rng); + } + + // keep samples is flag is on + if let Some(ref mut all_samples) = maybe_all_samples { + all_samples.push(samples.iter().map(|x| *x != 0).collect()) + } + + let params = DecisionTreeRegressorParameters { + max_depth: parameters.max_depth, + min_samples_leaf: parameters.min_samples_leaf, + min_samples_split: parameters.min_samples_split, + seed: Some(parameters.seed), + }; + let tree = DecisionTreeRegressor::fit_weak_learner( + x, + y, + samples.clone(), + mtry, + params, + parameters.use_random_splits, + )?; + trees.push(tree); + } + + Ok(ForestRegressor { + trees: Some(trees), + samples: maybe_all_samples, + }) + } + + /// Predict class for `x` + /// * `x` - _KxM_ data where _K_ is number of observations and _M_ is number of features. + pub fn predict(&self, x: &X) -> Result { + let mut result = Y::zeros(x.shape().0); + + let (n, _) = x.shape(); + + for i in 0..n { + result.set(i, self.predict_for_row(x, i)); + } + + Ok(result) + } + + fn predict_for_row(&self, x: &X, row: usize) -> TY { + let n_trees = self.trees.as_ref().unwrap().len(); + + let mut result = TY::zero(); + + for tree in self.trees.as_ref().unwrap().iter() { + result += tree.predict_for_row(x, row); + } + + result / TY::from_usize(n_trees).unwrap() + } + + /// Predict OOB classes for `x`. `x` is expected to be equal to the dataset used in training. + pub fn predict_oob(&self, x: &X) -> Result { + let (n, _) = x.shape(); + if self.samples.is_none() { + Err(Failed::because( + FailedError::PredictFailed, + "Need samples=true for OOB predictions.", + )) + } else if self.samples.as_ref().unwrap()[0].len() != n { + Err(Failed::because( + FailedError::PredictFailed, + "Prediction matrix must match matrix used in training for OOB predictions.", + )) + } else { + let mut result = Y::zeros(n); + + for i in 0..n { + result.set(i, self.predict_for_row_oob(x, i)); + } + + Ok(result) + } + } + + fn predict_for_row_oob(&self, x: &X, row: usize) -> TY { + let mut n_trees = 0; + let mut result = TY::zero(); + + for (tree, samples) in self + .trees + .as_ref() + .unwrap() + .iter() + .zip(self.samples.as_ref().unwrap()) + { + if !samples[row] { + result += tree.predict_for_row(x, row); + n_trees += 1; + } + } + + // TODO: What to do if there are no oob trees? + result / TY::from(n_trees).unwrap() + } + + fn sample_with_replacement(nrows: usize, rng: &mut impl Rng) -> Vec { + let mut samples = vec![0; nrows]; + for _ in 0..nrows { + let xi = rng.gen_range(0..nrows); + samples[xi] += 1; + } + samples + } +} diff --git a/src/ensemble/mod.rs b/src/ensemble/mod.rs index 8cebd5c5..45c35daf 100644 --- a/src/ensemble/mod.rs +++ b/src/ensemble/mod.rs @@ -16,6 +16,8 @@ //! //! 
* ["An Introduction to Statistical Learning", James G., Witten D., Hastie T., Tibshirani R., 8.2 Bagging, Random Forests, Boosting](http://faculty.marshall.usc.edu/gareth-james/ISL/) +pub mod extra_trees_regressor; +mod forest_regressor; /// Random forest classifier pub mod random_forest_classifier; /// Random forest regressor diff --git a/src/ensemble/random_forest_regressor.rs b/src/ensemble/random_forest_regressor.rs index efc63d3d..b597b5b2 100644 --- a/src/ensemble/random_forest_regressor.rs +++ b/src/ensemble/random_forest_regressor.rs @@ -43,7 +43,6 @@ //! //! -use rand::Rng; use std::default::Default; use std::fmt::Debug; @@ -51,15 +50,13 @@ use std::fmt::Debug; use serde::{Deserialize, Serialize}; use crate::api::{Predictor, SupervisedEstimator}; -use crate::error::{Failed, FailedError}; +use crate::ensemble::forest_regressor::ForestRegressorParameters; +use crate::error::Failed; use crate::linalg::basic::arrays::{Array1, Array2}; use crate::numbers::basenum::Number; use crate::numbers::floatnum::FloatNumber; -use crate::rand_custom::get_rng_impl; -use crate::tree::decision_tree_regressor::{ - DecisionTreeRegressor, DecisionTreeRegressorParameters, -}; +use super::forest_regressor::ForestRegressor; #[cfg_attr(feature = "serde", derive(Serialize, Deserialize))] #[derive(Debug, Clone)] @@ -98,8 +95,7 @@ pub struct RandomForestRegressor< X: Array2, Y: Array1, > { - trees: Option>>, - samples: Option>>, + forest_regressor: Option>, } impl RandomForestRegressorParameters { @@ -159,14 +155,7 @@ impl, Y: Array1 for RandomForestRegressor { fn eq(&self, other: &Self) -> bool { - if self.trees.as_ref().unwrap().len() != other.trees.as_ref().unwrap().len() { - false - } else { - self.trees - .iter() - .zip(other.trees.iter()) - .all(|(a, b)| a == b) - } + self.forest_regressor == other.forest_regressor } } @@ -176,8 +165,7 @@ impl, Y: Array1 { fn new() -> Self { Self { - trees: Option::None, - samples: Option::None, + forest_regressor: Option::None, } } @@ -397,128 +385,35 @@ impl, Y: Array1 y: &Y, parameters: RandomForestRegressorParameters, ) -> Result, Failed> { - let (n_rows, num_attributes) = x.shape(); - - if n_rows != y.shape() { - return Err(Failed::fit("Number of rows in X should = len(y)")); - } - - let mtry = parameters - .m - .unwrap_or((num_attributes as f64).sqrt().floor() as usize); - - let mut rng = get_rng_impl(Some(parameters.seed)); - let mut trees: Vec> = Vec::new(); - - let mut maybe_all_samples: Option>> = Option::None; - if parameters.keep_samples { - // TODO: use with_capacity here - maybe_all_samples = Some(Vec::new()); - } - - for _ in 0..parameters.n_trees { - let samples: Vec = - RandomForestRegressor::::sample_with_replacement(n_rows, &mut rng); - - // keep samples is flag is on - if let Some(ref mut all_samples) = maybe_all_samples { - all_samples.push(samples.iter().map(|x| *x != 0).collect()) - } - - let params = DecisionTreeRegressorParameters { - max_depth: parameters.max_depth, - min_samples_leaf: parameters.min_samples_leaf, - min_samples_split: parameters.min_samples_split, - seed: Some(parameters.seed), - }; - let tree = DecisionTreeRegressor::fit_weak_learner(x, y, samples, mtry, params)?; - trees.push(tree); - } + let regressor_params = ForestRegressorParameters { + max_depth: parameters.max_depth, + min_samples_leaf: parameters.min_samples_leaf, + min_samples_split: parameters.min_samples_split, + n_trees: parameters.n_trees, + m: parameters.m, + keep_samples: parameters.keep_samples, + seed: parameters.seed, + bootstrap: true, + use_random_splits: 
false, + }; + let forest_regressor = ForestRegressor::fit(x, y, regressor_params)?; Ok(RandomForestRegressor { - trees: Some(trees), - samples: maybe_all_samples, + forest_regressor: Some(forest_regressor), }) } /// Predict class for `x` /// * `x` - _KxM_ data where _K_ is number of observations and _M_ is number of features. pub fn predict(&self, x: &X) -> Result { - let mut result = Y::zeros(x.shape().0); - - let (n, _) = x.shape(); - - for i in 0..n { - result.set(i, self.predict_for_row(x, i)); - } - - Ok(result) - } - - fn predict_for_row(&self, x: &X, row: usize) -> TY { - let n_trees = self.trees.as_ref().unwrap().len(); - - let mut result = TY::zero(); - - for tree in self.trees.as_ref().unwrap().iter() { - result += tree.predict_for_row(x, row); - } - - result / TY::from_usize(n_trees).unwrap() + let forest_regressor = self.forest_regressor.as_ref().unwrap(); + forest_regressor.predict(x) } /// Predict OOB classes for `x`. `x` is expected to be equal to the dataset used in training. pub fn predict_oob(&self, x: &X) -> Result { - let (n, _) = x.shape(); - if self.samples.is_none() { - Err(Failed::because( - FailedError::PredictFailed, - "Need samples=true for OOB predictions.", - )) - } else if self.samples.as_ref().unwrap()[0].len() != n { - Err(Failed::because( - FailedError::PredictFailed, - "Prediction matrix must match matrix used in training for OOB predictions.", - )) - } else { - let mut result = Y::zeros(n); - - for i in 0..n { - result.set(i, self.predict_for_row_oob(x, i)); - } - - Ok(result) - } - } - - fn predict_for_row_oob(&self, x: &X, row: usize) -> TY { - let mut n_trees = 0; - let mut result = TY::zero(); - - for (tree, samples) in self - .trees - .as_ref() - .unwrap() - .iter() - .zip(self.samples.as_ref().unwrap()) - { - if !samples[row] { - result += tree.predict_for_row(x, row); - n_trees += 1; - } - } - - // TODO: What to do if there are no oob trees? 
- result / TY::from(n_trees).unwrap() - } - - fn sample_with_replacement(nrows: usize, rng: &mut impl Rng) -> Vec { - let mut samples = vec![0; nrows]; - for _ in 0..nrows { - let xi = rng.gen_range(0..nrows); - samples[xi] += 1; - } - samples + let forest_regressor = self.forest_regressor.as_ref().unwrap(); + forest_regressor.predict_oob(x) } } diff --git a/src/lib.rs b/src/lib.rs index c68368fa..c6f9349c 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -130,6 +130,5 @@ pub mod readers; pub mod svm; /// Supervised tree-based learning methods pub mod tree; -pub mod xgboost; pub(crate) mod rand_custom; diff --git a/src/tree/decision_tree_regressor.rs b/src/tree/decision_tree_regressor.rs index d735697d..af4e2748 100644 --- a/src/tree/decision_tree_regressor.rs +++ b/src/tree/decision_tree_regressor.rs @@ -426,7 +426,7 @@ impl, Y: Array1> } let samples = vec![1; x_nrows]; - DecisionTreeRegressor::fit_weak_learner(x, y, samples, num_attributes, parameters) + DecisionTreeRegressor::fit_weak_learner(x, y, samples, num_attributes, parameters, false) } pub(crate) fn fit_weak_learner( @@ -435,6 +435,7 @@ impl, Y: Array1> samples: Vec, mtry: usize, parameters: DecisionTreeRegressorParameters, + use_random_splits: bool, ) -> Result, Failed> { let y_m = y.clone(); @@ -474,13 +475,15 @@ impl, Y: Array1> let mut visitor_queue: LinkedList> = LinkedList::new(); - if tree.find_best_cutoff(&mut visitor, mtry, &mut rng) { + if tree.find_best_cutoff(&mut visitor, mtry, &mut rng, use_random_splits) { visitor_queue.push_back(visitor); } while tree.depth() < tree.parameters().max_depth.unwrap_or(u16::MAX) { match visitor_queue.pop_front() { - Some(node) => tree.split(node, mtry, &mut visitor_queue, &mut rng), + Some(node) => { + tree.split(node, mtry, &mut visitor_queue, &mut rng, use_random_splits) + } None => break, }; } @@ -534,6 +537,7 @@ impl, Y: Array1> visitor: &mut NodeVisitor<'_, TX, TY, X, Y>, mtry: usize, rng: &mut impl Rng, + use_random_splits: bool, ) -> bool { let (_, n_attr) = visitor.x.shape(); @@ -555,7 +559,15 @@ impl, Y: Array1> n as f64 * self.nodes()[visitor.node].output * self.nodes()[visitor.node].output; for variable in variables.iter().take(mtry) { - self.find_best_split(visitor, n, sum, parent_gain, *variable); + self.find_best_split( + visitor, + n, + sum, + parent_gain, + *variable, + rng, + use_random_splits, + ); } self.nodes()[visitor.node].split_score.is_some() @@ -568,65 +580,137 @@ impl, Y: Array1> sum: f64, parent_gain: f64, j: usize, + rng: &mut impl Rng, + use_random_splits: bool, ) { - let mut true_sum = 0f64; - let mut true_count = 0; - let mut prevx = Option::None; - - for i in visitor.order[j].iter() { - if visitor.samples[*i] > 0 { - let x_ij = *visitor.x.get((*i, j)); - - if prevx.is_none() || x_ij == prevx.unwrap() { - prevx = Some(x_ij); - true_count += visitor.samples[*i]; - true_sum += visitor.samples[*i] as f64 * visitor.y.get(*i).to_f64().unwrap(); - continue; + if use_random_splits { + // --- EFFICIENT LOGIC for Continuous Random Splits --- + let (min_val, max_val) = { + let mut min_opt = None; + let mut max_opt = None; + for &i in &visitor.order[j] { + if visitor.samples[i] > 0 { + min_opt = Some(*visitor.x.get((i, j))); + break; + } + } + for &i in visitor.order[j].iter().rev() { + if visitor.samples[i] > 0 { + max_opt = Some(*visitor.x.get((i, j))); + break; + } } + if min_opt.is_none() { + return; + } + (min_opt.unwrap(), max_opt.unwrap()) + }; - let false_count = n - true_count; + if min_val >= max_val { + return; + } - if true_count < 
self.parameters().min_samples_leaf - || false_count < self.parameters().min_samples_leaf - { - prevx = Some(x_ij); - true_count += visitor.samples[*i]; - true_sum += visitor.samples[*i] as f64 * visitor.y.get(*i).to_f64().unwrap(); - continue; - } + let split_value = rng.gen_range(min_val.to_f64().unwrap()..max_val.to_f64().unwrap()); - let true_mean = true_sum / true_count as f64; - let false_mean = (sum - true_sum) / false_count as f64; + let mut true_sum = 0f64; + let mut true_count = 0; + for &i in &visitor.order[j] { + if visitor.samples[i] > 0 { + if visitor.x.get((i, j)).to_f64().unwrap() <= split_value { + true_sum += visitor.samples[i] as f64 * visitor.y.get(i).to_f64().unwrap(); + true_count += visitor.samples[i]; + } else { + break; + } + } + } - let gain = (true_count as f64 * true_mean * true_mean - + false_count as f64 * false_mean * false_mean) - - parent_gain; + let false_count = n - true_count; - if self.nodes()[visitor.node].split_score.is_none() - || gain > self.nodes()[visitor.node].split_score.unwrap() - { - self.nodes[visitor.node].split_feature = j; - self.nodes[visitor.node].split_value = - Option::Some((x_ij + prevx.unwrap()).to_f64().unwrap() / 2f64); - self.nodes[visitor.node].split_score = Option::Some(gain); + if true_count < self.parameters().min_samples_leaf + || false_count < self.parameters().min_samples_leaf + { + return; + } - visitor.true_child_output = true_mean; - visitor.false_child_output = false_mean; + let true_mean = if true_count > 0 { + true_sum / true_count as f64 + } else { + 0.0 + }; + let false_mean = if false_count > 0 { + (sum - true_sum) / false_count as f64 + } else { + 0.0 + }; + let gain = (true_count as f64 * true_mean * true_mean + + false_count as f64 * false_mean * false_mean) + - parent_gain; + + if self.nodes[visitor.node].split_score.is_none() + || gain > self.nodes[visitor.node].split_score.unwrap() + { + self.nodes[visitor.node].split_feature = j; + self.nodes[visitor.node].split_value = Some(split_value); + self.nodes[visitor.node].split_score = Some(gain); + visitor.true_child_output = true_mean; + visitor.false_child_output = false_mean; + } + } else { + let mut true_sum = 0f64; + let mut true_count = 0; + let mut prevx = Option::None; + let order = &visitor.order[j]; + + for i in order.iter() { + if visitor.samples[*i] > 0 { + let x_ij = *visitor.x.get((*i, j)); + if prevx.is_none() || x_ij == prevx.unwrap() { + prevx = Some(x_ij); + true_count += visitor.samples[*i]; + true_sum += + visitor.samples[*i] as f64 * visitor.y.get(*i).to_f64().unwrap(); + continue; + } + let false_count = n - true_count; + if true_count < self.parameters().min_samples_leaf + || false_count < self.parameters().min_samples_leaf + { + prevx = Some(x_ij); + true_count += visitor.samples[*i]; + true_sum += + visitor.samples[*i] as f64 * visitor.y.get(*i).to_f64().unwrap(); + continue; + } + let true_mean = true_sum / true_count as f64; + let false_mean = (sum - true_sum) / false_count as f64; + let gain = (true_count as f64 * true_mean * true_mean + + false_count as f64 * false_mean * false_mean) + - parent_gain; + if self.nodes[visitor.node].split_score.is_none() + || gain > self.nodes[visitor.node].split_score.unwrap() + { + self.nodes[visitor.node].split_feature = j; + self.nodes[visitor.node].split_value = + Option::Some((x_ij + prevx.unwrap()).to_f64().unwrap() / 2f64); + self.nodes[visitor.node].split_score = Some(gain); + visitor.true_child_output = true_mean; + visitor.false_child_output = false_mean; + } + prevx = Some(x_ij); + true_sum += 
visitor.samples[*i] as f64 * visitor.y.get(*i).to_f64().unwrap(); + true_count += visitor.samples[*i]; } - - prevx = Some(x_ij); - true_sum += visitor.samples[*i] as f64 * visitor.y.get(*i).to_f64().unwrap(); - true_count += visitor.samples[*i]; } } } - fn split<'a>( &mut self, mut visitor: NodeVisitor<'a, TX, TY, X, Y>, mtry: usize, visitor_queue: &mut LinkedList>, rng: &mut impl Rng, + use_random_splits: bool, ) -> bool { let (n, _) = visitor.x.shape(); let mut tc = 0; @@ -679,7 +763,7 @@ impl, Y: Array1> visitor.level + 1, ); - if self.find_best_cutoff(&mut true_visitor, mtry, rng) { + if self.find_best_cutoff(&mut true_visitor, mtry, rng, use_random_splits) { visitor_queue.push_back(true_visitor); } @@ -692,7 +776,7 @@ impl, Y: Array1> visitor.level + 1, ); - if self.find_best_cutoff(&mut false_visitor, mtry, rng) { + if self.find_best_cutoff(&mut false_visitor, mtry, rng, use_random_splits) { visitor_queue.push_back(false_visitor); } diff --git a/src/xgboost/mod.rs b/src/xgboost/mod.rs deleted file mode 100644 index 68e087be..00000000 --- a/src/xgboost/mod.rs +++ /dev/null @@ -1,16 +0,0 @@ -//! # XGBoost -//! -//! XGBoost, which stands for Extreme Gradient Boosting, is a powerful and efficient implementation of the gradient boosting framework. Gradient boosting is a machine learning technique for regression and classification problems, which produces a prediction model in the form of an ensemble of weak prediction models, typically decision trees. -//! -//! The core idea of boosting is to build the model in a stage-wise fashion. It learns from its mistakes by sequentially adding new models that correct the errors of the previous ones. Unlike bagging, which trains models in parallel, boosting is a sequential process. Each new tree is fit on a modified version of the original data set, specifically focusing on the instances where the previous models performed poorly. -//! -//! XGBoost enhances this process through several key innovations. It employs a more regularized model formalization to control over-fitting, which gives it better performance. It also has a highly optimized and parallelized tree construction process, making it significantly faster and more scalable than traditional gradient boosting implementations. -//! -//! ## References: -//! -//! * "Elements of Statistical Learning", Hastie T., Tibshirani R., Friedman J., 10. Boosting and Additive Trees -//! * XGBoost: A Scalable Tree Boosting System, Chen T., Guestrin C. - -// xgboost implementation -pub mod xgb_regressor; -pub use xgb_regressor::{XGRegressor, XGRegressorParameters}; diff --git a/src/xgboost/xgb_regressor.rs b/src/xgboost/xgb_regressor.rs deleted file mode 100644 index ac6ec752..00000000 --- a/src/xgboost/xgb_regressor.rs +++ /dev/null @@ -1,762 +0,0 @@ -//! # Extreme Gradient Boosting (XGBoost) -//! -//! XGBoost is a highly efficient and effective implementation of the gradient boosting framework. -//! Like other boosting models, it builds an ensemble of sequential decision trees, where each new tree -//! is trained to correct the errors of the previous ones. -//! -//! What makes XGBoost powerful is its use of both the first and second derivatives (gradient and hessian) -//! of the loss function, which allows for more accurate approximations and faster convergence. It also -//! includes built-in regularization techniques (L1/`alpha` and L2/`lambda`) to prevent overfitting. -//! -//! This implementation was ported to Rust from the concepts and algorithm explained in the blog post -//! 
["XGBoost from Scratch"](https://randomrealizations.com/posts/xgboost-from-scratch/). It is designed -//! to be a general-purpose regressor that can be used with any objective function that provides a gradient -//! and a hessian. -//! -//! Example: -//! -//! ``` -//! use smartcore::linalg::basic::matrix::DenseMatrix; -//! use smartcore::xgboost::{XGRegressor, XGRegressorParameters}; -//! -//! // Simple dataset: predict y = 2*x -//! let x = DenseMatrix::from_2d_array(&[ -//! &[1.0], &[2.0], &[3.0], &[4.0], &[5.0] -//! ]).unwrap(); -//! let y = vec![2.0, 4.0, 6.0, 8.0, 10.0]; -//! -//! // Use default parameters, but set a few for demonstration -//! let parameters = XGRegressorParameters::default() -//! .with_n_estimators(50) -//! .with_max_depth(3) -//! .with_learning_rate(0.1); -//! -//! // Train the model -//! let model = XGRegressor::fit(&x, &y, parameters).unwrap(); -//! -//! // Make predictions -//! let x_test = DenseMatrix::from_2d_array(&[&[6.0], &[7.0]]).unwrap(); -//! let y_hat = model.predict(&x_test).unwrap(); -//! -//! // y_hat should be close to [12.0, 14.0] -//! ``` -//! - -use rand::{seq::SliceRandom, Rng}; -use std::{iter::zip, marker::PhantomData}; - -use crate::{ - api::{PredictorBorrow, SupervisedEstimatorBorrow}, - error::{Failed, FailedError}, - linalg::basic::arrays::{Array1, Array2}, - numbers::basenum::Number, - rand_custom::get_rng_impl, -}; - -/// Defines the objective function to be optimized. -/// The objective function provides the loss, gradient (first derivative), and -/// hessian (second derivative) required for the XGBoost algorithm. -#[derive(Clone, Debug)] -pub enum Objective { - /// The objective for regression tasks using Mean Squared Error. - /// Loss: 0.5 * (y_true - y_pred)^2 - MeanSquaredError, -} - -impl Objective { - /// Calculates the loss for each sample given the true and predicted values. - /// - /// # Arguments - /// * `y_true` - A vector of the true target values. - /// * `y_pred` - A vector of the predicted values. - /// - /// # Returns - /// The mean of the calculated loss values. - pub fn loss_function>(&self, y_true: &Y, y_pred: &Vec) -> f64 { - match self { - Objective::MeanSquaredError => { - zip(y_true.iterator(0), y_pred) - .map(|(true_val, pred_val)| { - 0.5 * (true_val.to_f64().unwrap() - pred_val).powi(2) - }) - .sum::() - / y_true.shape() as f64 - } - } - } - - /// Calculates the gradient (first derivative) of the loss function. - /// - /// # Arguments - /// * `y_true` - A vector of the true target values. - /// * `y_pred` - A vector of the predicted values. - /// - /// # Returns - /// A vector of gradients for each sample. - pub fn gradient>(&self, y_true: &Y, y_pred: &Vec) -> Vec { - match self { - Objective::MeanSquaredError => zip(y_true.iterator(0), y_pred) - .map(|(true_val, pred_val)| (*pred_val - true_val.to_f64().unwrap())) - .collect(), - } - } - - /// Calculates the hessian (second derivative) of the loss function. - /// - /// # Arguments - /// * `y_true` - A vector of the true target values. - /// * `y_pred` - A vector of the predicted values. - /// - /// # Returns - /// A vector of hessians for each sample. - #[allow(unused_variables)] - pub fn hessian>(&self, y_true: &Y, y_pred: &[f64]) -> Vec { - match self { - Objective::MeanSquaredError => vec![1.0; y_true.shape()], - } - } -} - -/// Represents a single decision tree in the XGBoost ensemble. -/// -/// This is a recursive data structure where each `TreeRegressor` is a node -/// that can have a left and a right child, also of type `TreeRegressor`. 
-#[allow(dead_code)] -struct TreeRegressor, Y: Array1> { - left: Option>>, - right: Option>>, - /// The output value of this node. If it's a leaf, this is the final prediction. - value: f64, - /// The feature value threshold used to split this node. - threshold: f64, - /// The index of the feature used for splitting. - split_feature_idx: usize, - /// The gain in score achieved by this split. - split_score: f64, - _phantom_tx: PhantomData, - _phantom_ty: PhantomData, - _phantom_x: PhantomData, - _phantom_y: PhantomData, -} - -impl, Y: Array1> - TreeRegressor -{ - /// Recursively builds a decision tree (a `TreeRegressor` node). - /// - /// This function determines the optimal split for the given set of samples (`idxs`) - /// and then recursively calls itself to build the left and right child nodes. - /// - /// # Arguments - /// * `data` - The full training dataset. - /// * `g` - Gradients for all samples. - /// * `h` - Hessians for all samples. - /// * `idxs` - The indices of the samples belonging to the current node. - /// * `max_depth` - The maximum remaining depth for this branch. - /// * `min_child_weight` - The minimum sum of hessians required in a child node. - /// * `lambda` - L2 regularization term on weights. - /// * `gamma` - Minimum loss reduction required to make a further partition. - pub fn fit( - data: &X, - g: &Vec, - h: &Vec, - idxs: &[usize], - max_depth: u16, - min_child_weight: f64, - lambda: f64, - gamma: f64, - ) -> Self { - let g_sum = idxs.iter().map(|&i| g[i]).sum::(); - let h_sum = idxs.iter().map(|&i| h[i]).sum::(); - let value = -g_sum / (h_sum + lambda); - - let mut best_feature_idx = usize::MAX; - let mut best_split_score = 0.0; - let mut best_threshold = 0.0; - let mut left = Option::None; - let mut right = Option::None; - - if max_depth > 0 { - Self::insert_child_nodes( - data, - g, - h, - idxs, - &mut best_feature_idx, - &mut best_split_score, - &mut best_threshold, - &mut left, - &mut right, - max_depth, - min_child_weight, - lambda, - gamma, - ); - } - - Self { - left, - right, - value, - threshold: best_threshold, - split_feature_idx: best_feature_idx, - split_score: best_split_score, - _phantom_tx: PhantomData, - _phantom_ty: PhantomData, - _phantom_x: PhantomData, - _phantom_y: PhantomData, - } - } - - /// Finds the best split and creates child nodes if a valid split is found. - fn insert_child_nodes( - data: &X, - g: &Vec, - h: &Vec, - idxs: &[usize], - best_feature_idx: &mut usize, - best_split_score: &mut f64, - best_threshold: &mut f64, - left: &mut Option>, - right: &mut Option>, - max_depth: u16, - min_child_weight: f64, - lambda: f64, - gamma: f64, - ) { - let (_, n_features) = data.shape(); - for i in 0..n_features { - Self::find_best_split( - data, - g, - h, - idxs, - i, - best_feature_idx, - best_split_score, - best_threshold, - min_child_weight, - lambda, - gamma, - ); - } - - // A split is only valid if it results in a positive gain. 
- if *best_split_score > 0.0 { - let mut left_idxs = Vec::new(); - let mut right_idxs = Vec::new(); - for idx in idxs.iter() { - if data.get((*idx, *best_feature_idx)).to_f64().unwrap() <= *best_threshold { - left_idxs.push(*idx); - } else { - right_idxs.push(*idx); - } - } - - *left = Some(Box::new(TreeRegressor::fit( - data, - g, - h, - &left_idxs, - max_depth - 1, - min_child_weight, - lambda, - gamma, - ))); - *right = Some(Box::new(TreeRegressor::fit( - data, - g, - h, - &right_idxs, - max_depth - 1, - min_child_weight, - lambda, - gamma, - ))); - } - } - - /// Iterates through a single feature to find the best possible split point. - fn find_best_split( - data: &X, - g: &[f64], - h: &[f64], - idxs: &[usize], - feature_idx: usize, - best_feature_idx: &mut usize, - best_split_score: &mut f64, - best_threshold: &mut f64, - min_child_weight: f64, - lambda: f64, - gamma: f64, - ) { - let mut sorted_idxs = idxs.to_owned(); - sorted_idxs.sort_by(|a, b| { - data.get((*a, feature_idx)) - .partial_cmp(data.get((*b, feature_idx))) - .unwrap() - }); - - let sum_g = sorted_idxs.iter().map(|&i| g[i]).sum::(); - let sum_h = sorted_idxs.iter().map(|&i| h[i]).sum::(); - - let mut sum_g_right = sum_g; - let mut sum_h_right = sum_h; - let mut sum_g_left = 0.0; - let mut sum_h_left = 0.0; - - for i in 0..sorted_idxs.len() - 1 { - let idx = sorted_idxs[i]; - let next_idx = sorted_idxs[i + 1]; - - let g_i = g[idx]; - let h_i = h[idx]; - let x_i = data.get((idx, feature_idx)).to_f64().unwrap(); - let x_i_next = data.get((next_idx, feature_idx)).to_f64().unwrap(); - - sum_g_left += g_i; - sum_h_left += h_i; - sum_g_right -= g_i; - sum_h_right -= h_i; - - if sum_h_left < min_child_weight || x_i == x_i_next { - continue; - } - if sum_h_right < min_child_weight { - break; - } - - let gain = 0.5 - * ((sum_g_left * sum_g_left / (sum_h_left + lambda)) - + (sum_g_right * sum_g_right / (sum_h_right + lambda)) - - (sum_g * sum_g / (sum_h + lambda))) - - gamma; - - if gain > *best_split_score { - *best_split_score = gain; - *best_threshold = (x_i + x_i_next) / 2.0; - *best_feature_idx = feature_idx; - } - } - } - - /// Predicts the output values for a dataset. - pub fn predict(&self, data: &X) -> Vec { - let (n_samples, n_features) = data.shape(); - (0..n_samples) - .map(|i| { - self.predict_for_row(&Vec::from_iterator( - data.get_row(i).iterator(0).copied(), - n_features, - )) - }) - .collect() - } - - /// Predicts the output value for a single row of data by traversing the tree. - pub fn predict_for_row(&self, row: &Vec) -> f64 { - // A leaf node is identified by having no children. - if self.left.is_none() { - return self.value; - } - - // Recurse down the appropriate branch. - let child = if row[self.split_feature_idx].to_f64().unwrap() <= self.threshold { - self.left.as_ref().unwrap() - } else { - self.right.as_ref().unwrap() - }; - - child.predict_for_row(row) - } -} - -/// Parameters for the `jRegressor` model. -/// -/// This struct holds all the hyperparameters that control the training process. -#[derive(Clone, Debug)] -pub struct XGRegressorParameters { - /// The number of boosting rounds or trees to build. - pub n_estimators: usize, - /// The maximum depth of each tree. - pub max_depth: u16, - /// Step size shrinkage used to prevent overfitting. - pub learning_rate: f64, - /// Minimum sum of instance weight (hessian) needed in a child. - pub min_child_weight: usize, - /// L2 regularization term on weights. - pub lambda: f64, - /// Minimum loss reduction required to make a further partition on a leaf node. 
- pub gamma: f64, - /// The initial prediction score for all instances. - pub base_score: f64, - /// The fraction of samples to be used for fitting the individual base learners. - pub subsample: f64, - /// The seed for the random number generator for reproducibility. - pub seed: u64, - /// The objective function to be optimized. - pub objective: Objective, -} - -impl Default for XGRegressorParameters { - /// Creates a new set of `XGRegressorParameters` with default values. - fn default() -> Self { - Self { - n_estimators: 100, - learning_rate: 0.3, - max_depth: 6, - min_child_weight: 1, - lambda: 1.0, - gamma: 0.0, - base_score: 0.5, - subsample: 1.0, - seed: 0, - objective: Objective::MeanSquaredError, - } - } -} - -// Builder pattern for XGRegressorParameters -impl XGRegressorParameters { - /// Sets the number of boosting rounds or trees to build. - pub fn with_n_estimators(mut self, n_estimators: usize) -> Self { - self.n_estimators = n_estimators; - self - } - - /// Sets the step size shrinkage used to prevent overfitting. - /// - /// Also known as `eta`. A smaller value makes the model more robust by preventing - /// too much weight being given to any single tree. - pub fn with_learning_rate(mut self, learning_rate: f64) -> Self { - self.learning_rate = learning_rate; - self - } - - /// Sets the maximum depth of each individual tree. - // A lower value helps prevent overfitting.* - pub fn with_max_depth(mut self, max_depth: u16) -> Self { - self.max_depth = max_depth; - self - } - - /// Sets the minimum sum of instance weight (hessian) needed in a child node. - /// - /// If the tree partition step results in a leaf node with the sum of - // instance weight less than `min_child_weight`, then the building process* - /// will give up further partitioning. - pub fn with_min_child_weight(mut self, min_child_weight: usize) -> Self { - self.min_child_weight = min_child_weight; - self - } - - /// Sets the L2 regularization term on weights (`lambda`). - /// - /// Increasing this value will make the model more conservative. - pub fn with_lambda(mut self, lambda: f64) -> Self { - self.lambda = lambda; - self - } - - /// Sets the minimum loss reduction required to make a further partition on a leaf node. - /// - /// The larger `gamma` is, the more conservative the algorithm will be. - pub fn with_gamma(mut self, gamma: f64) -> Self { - self.gamma = gamma; - self - } - - /// Sets the initial prediction score for all instances. - pub fn with_base_score(mut self, base_score: f64) -> Self { - self.base_score = base_score; - self - } - - /// Sets the fraction of samples to be used for fitting individual base learners. - /// - /// A value of less than 1.0 introduces randomness and helps prevent overfitting. - pub fn with_subsample(mut self, subsample: f64) -> Self { - self.subsample = subsample; - self - } - - /// Sets the seed for the random number generator for reproducibility. - pub fn with_seed(mut self, seed: u64) -> Self { - self.seed = seed; - self - } - - /// Sets the objective function to be optimized during training. - pub fn with_objective(mut self, objective: Objective) -> Self { - self.objective = objective; - self - } -} - -/// An Extreme Gradient Boosting (XGBoost) model for regression and classification tasks. -pub struct XGRegressor, Y: Array1> { - regressors: Option>>, - parameters: Option, - _phantom_ty: PhantomData, - _phantom_tx: PhantomData, - _phantom_y: PhantomData, - _phantom_x: PhantomData, -} - -impl, Y: Array1> XGRegressor { - /// Fits the XGBoost model to the training data. 
- pub fn fit(data: &X, y: &Y, parameters: XGRegressorParameters) -> Result { - if parameters.subsample > 1.0 || parameters.subsample <= 0.0 { - return Err(Failed::because( - FailedError::ParametersError, - "Subsample ratio must be in (0, 1].", - )); - } - - let (n_samples, _) = data.shape(); - let learning_rate = parameters.learning_rate; - let mut predictions = vec![parameters.base_score; n_samples]; - - let mut regressors = Vec::new(); - let mut rng = get_rng_impl(Some(parameters.seed)); - - for _ in 0..parameters.n_estimators { - let gradients = parameters.objective.gradient(y, &predictions); - let hessians = parameters.objective.hessian(y, &predictions); - - let sample_idxs = if parameters.subsample < 1.0 { - Self::sample_without_replacement(n_samples, parameters.subsample, &mut rng) - } else { - (0..n_samples).collect::>() - }; - - let regressor = TreeRegressor::fit( - data, - &gradients, - &hessians, - &sample_idxs, - parameters.max_depth, - parameters.min_child_weight as f64, - parameters.lambda, - parameters.gamma, - ); - - let corrections = regressor.predict(data); - predictions = zip(predictions, corrections) - .map(|(pred, correction)| pred + (learning_rate * correction)) - .collect(); - - regressors.push(regressor); - } - - Ok(Self { - regressors: Some(regressors), - parameters: Some(parameters), - _phantom_ty: PhantomData, - _phantom_y: PhantomData, - _phantom_tx: PhantomData, - _phantom_x: PhantomData, - }) - } - - /// Predicts target values for the given input data. - pub fn predict(&self, data: &X) -> Result, Failed> { - let (n_samples, _) = data.shape(); - - let parameters = self.parameters.as_ref().unwrap(); - let mut predictions = vec![parameters.base_score; n_samples]; - let regressors = self.regressors.as_ref().unwrap(); - - for regressor in regressors.iter() { - let corrections = regressor.predict(data); - predictions = zip(predictions, corrections) - .map(|(pred, correction)| pred + (parameters.learning_rate * correction)) - .collect(); - } - - Ok(predictions - .into_iter() - .map(|p| TX::from_f64(p).unwrap()) - .collect()) - } - - /// Creates a random sample of indices without replacement. - fn sample_without_replacement( - population_size: usize, - subsample_ratio: f64, - rng: &mut impl Rng, - ) -> Vec { - let mut indices: Vec = (0..population_size).collect(); - indices.shuffle(rng); - indices.truncate((population_size as f64 * subsample_ratio) as usize); - indices - } -} - -// Boilerplate implementation for the smartcore traits -impl, Y: Array1> - SupervisedEstimatorBorrow<'_, X, Y, XGRegressorParameters> for XGRegressor -{ - fn new() -> Self { - Self { - regressors: None, - parameters: None, - _phantom_ty: PhantomData, - _phantom_y: PhantomData, - _phantom_tx: PhantomData, - _phantom_x: PhantomData, - } - } - - fn fit(x: &X, y: &Y, parameters: &XGRegressorParameters) -> Result { - XGRegressor::fit(x, y, parameters.clone()) - } -} - -impl, Y: Array1> PredictorBorrow<'_, X, TX> - for XGRegressor -{ - fn predict(&self, x: &X) -> Result, Failed> { - self.predict(x) - } -} - -// ------------------- TESTS ------------------- -#[cfg(test)] -mod tests { - use super::*; - use crate::linalg::basic::{arrays::Array, matrix::DenseMatrix}; - - /// Tests the gradient and hessian calculations for MeanSquaredError. 
- #[test] - fn test_mse_objective() { - let objective = Objective::MeanSquaredError; - let y_true = vec![1.0, 2.0, 3.0]; - let y_pred = vec![1.5, 2.5, 2.5]; - - let gradients = objective.gradient(&y_true, &y_pred); - let hessians = objective.hessian(&y_true, &y_pred); - - // Gradients should be (pred - true) - assert_eq!(gradients, vec![0.5, 0.5, -0.5]); - // Hessians should be all 1.0 for MSE - assert_eq!(hessians, vec![1.0, 1.0, 1.0]); - } - - #[test] - fn test_find_best_split_multidimensional() { - // Data has two features. The second feature is a better predictor. - let data = vec![ - vec![1.0, 10.0], // g = -0.5 - vec![1.0, 20.0], // g = -1.0 - vec![1.0, 30.0], // g = 1.0 - vec![1.0, 40.0], // g = 1.5 - ]; - let data = DenseMatrix::from_2d_vec(&data).unwrap(); - let g = vec![-0.5, -1.0, 1.0, 1.5]; - let h = vec![1.0, 1.0, 1.0, 1.0]; - let idxs = (0..4).collect::>(); - - let mut best_feature_idx = usize::MAX; - let mut best_split_score = 0.0; - let mut best_threshold = 0.0; - - // Manually calculated expected gain for the best split (on feature 1, with lambda=1.0). - // G_left = -1.5, H_left = 2.0 - // G_right = 2.5, H_right = 2.0 - // G_total = 1.0, H_total = 4.0 - // Gain = 0.5 * (G_l^2/(H_l+λ) + G_r^2/(H_r+λ) - G_t^2/(H_t+λ)) - // Gain = 0.5 * ((-1.5)^2/(2+1) + (2.5)^2/(2+1) - (1.0)^2/(4+1)) - // Gain = 0.5 * (2.25/3 + 6.25/3 - 1.0/5) = 0.5 * (0.75 + 2.0833 - 0.2) = 1.3166... - let expected_gain = 1.3166666666666667; - - // Search both features. The algorithm must find the best split on feature 1. - let (_, n_features) = data.shape(); - for i in 0..n_features { - TreeRegressor::, Vec>::find_best_split( - &data, - &g, - &h, - &idxs, - i, - &mut best_feature_idx, - &mut best_split_score, - &mut best_threshold, - 1.0, - 1.0, - 0.0, - ); - } - - assert_eq!(best_feature_idx, 1); // Should choose the second feature - assert!((best_split_score - expected_gain).abs() < 1e-9); - assert_eq!(best_threshold, 25.0); // (20 + 30) / 2 - } - - /// Tests that the TreeRegressor can build a simple one-level tree on multidimensional data. - #[test] - fn test_tree_regressor_fit_multidimensional() { - let data = vec![ - vec![1.0, 10.0], - vec![1.0, 20.0], - vec![1.0, 30.0], - vec![1.0, 40.0], - ]; - let data = DenseMatrix::from_2d_vec(&data).unwrap(); - let g = vec![-0.5, -1.0, 1.0, 1.5]; - let h = vec![1.0, 1.0, 1.0, 1.0]; - let idxs = (0..4).collect::>(); - - let tree = TreeRegressor::, Vec>::fit( - &data, &g, &h, &idxs, 2, 1.0, 1.0, 0.0, - ); - - // Check that the root node was split on the correct feature - assert!(tree.left.is_some()); - assert!(tree.right.is_some()); - assert_eq!(tree.split_feature_idx, 1); // Should split on the second feature - assert_eq!(tree.threshold, 25.0); - - // Check leaf values (G/H+lambda) - // Left leaf: G = -1.5, H = 2.0 => value = -(-1.5)/(2+1) = 0.5 - // Right leaf: G = 2.5, H = 2.0 => value = -(2.5)/(2+1) = -0.8333 - assert!((tree.left.unwrap().value - 0.5).abs() < 1e-9); - assert!((tree.right.unwrap().value - (-0.833333333)).abs() < 1e-9); - } - - /// A "smoke test" to ensure the main XGRegressor can fit and predict on multidimensional data. 
- #[test] - fn test_xgregressor_fit_predict_multidimensional() { - // Simple 2D data where y is roughly 2*x1 + 3*x2 - let x_vec = vec![ - vec![1.0, 1.0], - vec![2.0, 1.0], - vec![1.0, 2.0], - vec![2.0, 2.0], - ]; - let x = DenseMatrix::from_2d_vec(&x_vec).unwrap(); - let y = vec![5.0, 7.0, 8.0, 10.0]; - - let params = XGRegressorParameters::default() - .with_n_estimators(10) - .with_max_depth(2); - - let fit_result = XGRegressor::fit(&x, &y, params); - assert!( - fit_result.is_ok(), - "Fit failed with error: {:?}", - fit_result.err() - ); - - let model = fit_result.unwrap(); - let predict_result = model.predict(&x); - assert!( - predict_result.is_ok(), - "Predict failed with error: {:?}", - predict_result.err() - ); - - let predictions = predict_result.unwrap(); - assert_eq!(predictions.len(), 4); - } -} From 50537e4b3620c49d80794d3b52311f2eabcee69b Mon Sep 17 00:00:00 2001 From: Daniel Lacina Date: Wed, 9 Jul 2025 13:21:20 -0500 Subject: [PATCH 2/4] implemented extra trees --- src/xgboost/mod.rs | 16 + src/xgboost/xgb_regressor.rs | 762 +++++++++++++++++++++++++++++++++++ 2 files changed, 778 insertions(+) create mode 100644 src/xgboost/mod.rs create mode 100644 src/xgboost/xgb_regressor.rs diff --git a/src/xgboost/mod.rs b/src/xgboost/mod.rs new file mode 100644 index 00000000..68e087be --- /dev/null +++ b/src/xgboost/mod.rs @@ -0,0 +1,16 @@ +//! # XGBoost +//! +//! XGBoost, which stands for Extreme Gradient Boosting, is a powerful and efficient implementation of the gradient boosting framework. Gradient boosting is a machine learning technique for regression and classification problems, which produces a prediction model in the form of an ensemble of weak prediction models, typically decision trees. +//! +//! The core idea of boosting is to build the model in a stage-wise fashion. It learns from its mistakes by sequentially adding new models that correct the errors of the previous ones. Unlike bagging, which trains models in parallel, boosting is a sequential process. Each new tree is fit on a modified version of the original data set, specifically focusing on the instances where the previous models performed poorly. +//! +//! XGBoost enhances this process through several key innovations. It employs a more regularized model formalization to control over-fitting, which gives it better performance. It also has a highly optimized and parallelized tree construction process, making it significantly faster and more scalable than traditional gradient boosting implementations. +//! +//! ## References: +//! +//! * "Elements of Statistical Learning", Hastie T., Tibshirani R., Friedman J., 10. Boosting and Additive Trees +//! * XGBoost: A Scalable Tree Boosting System, Chen T., Guestrin C. + +// xgboost implementation +pub mod xgb_regressor; +pub use xgb_regressor::{XGRegressor, XGRegressorParameters}; diff --git a/src/xgboost/xgb_regressor.rs b/src/xgboost/xgb_regressor.rs new file mode 100644 index 00000000..ac6ec752 --- /dev/null +++ b/src/xgboost/xgb_regressor.rs @@ -0,0 +1,762 @@ +//! # Extreme Gradient Boosting (XGBoost) +//! +//! XGBoost is a highly efficient and effective implementation of the gradient boosting framework. +//! Like other boosting models, it builds an ensemble of sequential decision trees, where each new tree +//! is trained to correct the errors of the previous ones. +//! +//! What makes XGBoost powerful is its use of both the first and second derivatives (gradient and hessian) +//! of the loss function, which allows for more accurate approximations and faster convergence. 
It also +//! includes built-in regularization techniques (L1/`alpha` and L2/`lambda`) to prevent overfitting. +//! +//! This implementation was ported to Rust from the concepts and algorithm explained in the blog post +//! ["XGBoost from Scratch"](https://randomrealizations.com/posts/xgboost-from-scratch/). It is designed +//! to be a general-purpose regressor that can be used with any objective function that provides a gradient +//! and a hessian. +//! +//! Example: +//! +//! ``` +//! use smartcore::linalg::basic::matrix::DenseMatrix; +//! use smartcore::xgboost::{XGRegressor, XGRegressorParameters}; +//! +//! // Simple dataset: predict y = 2*x +//! let x = DenseMatrix::from_2d_array(&[ +//! &[1.0], &[2.0], &[3.0], &[4.0], &[5.0] +//! ]).unwrap(); +//! let y = vec![2.0, 4.0, 6.0, 8.0, 10.0]; +//! +//! // Use default parameters, but set a few for demonstration +//! let parameters = XGRegressorParameters::default() +//! .with_n_estimators(50) +//! .with_max_depth(3) +//! .with_learning_rate(0.1); +//! +//! // Train the model +//! let model = XGRegressor::fit(&x, &y, parameters).unwrap(); +//! +//! // Make predictions +//! let x_test = DenseMatrix::from_2d_array(&[&[6.0], &[7.0]]).unwrap(); +//! let y_hat = model.predict(&x_test).unwrap(); +//! +//! // y_hat should be close to [12.0, 14.0] +//! ``` +//! + +use rand::{seq::SliceRandom, Rng}; +use std::{iter::zip, marker::PhantomData}; + +use crate::{ + api::{PredictorBorrow, SupervisedEstimatorBorrow}, + error::{Failed, FailedError}, + linalg::basic::arrays::{Array1, Array2}, + numbers::basenum::Number, + rand_custom::get_rng_impl, +}; + +/// Defines the objective function to be optimized. +/// The objective function provides the loss, gradient (first derivative), and +/// hessian (second derivative) required for the XGBoost algorithm. +#[derive(Clone, Debug)] +pub enum Objective { + /// The objective for regression tasks using Mean Squared Error. + /// Loss: 0.5 * (y_true - y_pred)^2 + MeanSquaredError, +} + +impl Objective { + /// Calculates the loss for each sample given the true and predicted values. + /// + /// # Arguments + /// * `y_true` - A vector of the true target values. + /// * `y_pred` - A vector of the predicted values. + /// + /// # Returns + /// The mean of the calculated loss values. + pub fn loss_function>(&self, y_true: &Y, y_pred: &Vec) -> f64 { + match self { + Objective::MeanSquaredError => { + zip(y_true.iterator(0), y_pred) + .map(|(true_val, pred_val)| { + 0.5 * (true_val.to_f64().unwrap() - pred_val).powi(2) + }) + .sum::() + / y_true.shape() as f64 + } + } + } + + /// Calculates the gradient (first derivative) of the loss function. + /// + /// # Arguments + /// * `y_true` - A vector of the true target values. + /// * `y_pred` - A vector of the predicted values. + /// + /// # Returns + /// A vector of gradients for each sample. + pub fn gradient>(&self, y_true: &Y, y_pred: &Vec) -> Vec { + match self { + Objective::MeanSquaredError => zip(y_true.iterator(0), y_pred) + .map(|(true_val, pred_val)| (*pred_val - true_val.to_f64().unwrap())) + .collect(), + } + } + + /// Calculates the hessian (second derivative) of the loss function. + /// + /// # Arguments + /// * `y_true` - A vector of the true target values. + /// * `y_pred` - A vector of the predicted values. + /// + /// # Returns + /// A vector of hessians for each sample. 
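+    ///
+    /// Note: for `Objective::MeanSquaredError` the per-sample loss is
+    /// 0.5 * (y_true - y_pred)^2, so the gradient is (y_pred - y_true) and the
+    /// second derivative is the constant 1.0 for every sample. This is why the
+    /// implementation below ignores `y_pred` and simply returns a vector of ones.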
+ #[allow(unused_variables)] + pub fn hessian>(&self, y_true: &Y, y_pred: &[f64]) -> Vec { + match self { + Objective::MeanSquaredError => vec![1.0; y_true.shape()], + } + } +} + +/// Represents a single decision tree in the XGBoost ensemble. +/// +/// This is a recursive data structure where each `TreeRegressor` is a node +/// that can have a left and a right child, also of type `TreeRegressor`. +#[allow(dead_code)] +struct TreeRegressor, Y: Array1> { + left: Option>>, + right: Option>>, + /// The output value of this node. If it's a leaf, this is the final prediction. + value: f64, + /// The feature value threshold used to split this node. + threshold: f64, + /// The index of the feature used for splitting. + split_feature_idx: usize, + /// The gain in score achieved by this split. + split_score: f64, + _phantom_tx: PhantomData, + _phantom_ty: PhantomData, + _phantom_x: PhantomData, + _phantom_y: PhantomData, +} + +impl, Y: Array1> + TreeRegressor +{ + /// Recursively builds a decision tree (a `TreeRegressor` node). + /// + /// This function determines the optimal split for the given set of samples (`idxs`) + /// and then recursively calls itself to build the left and right child nodes. + /// + /// # Arguments + /// * `data` - The full training dataset. + /// * `g` - Gradients for all samples. + /// * `h` - Hessians for all samples. + /// * `idxs` - The indices of the samples belonging to the current node. + /// * `max_depth` - The maximum remaining depth for this branch. + /// * `min_child_weight` - The minimum sum of hessians required in a child node. + /// * `lambda` - L2 regularization term on weights. + /// * `gamma` - Minimum loss reduction required to make a further partition. + pub fn fit( + data: &X, + g: &Vec, + h: &Vec, + idxs: &[usize], + max_depth: u16, + min_child_weight: f64, + lambda: f64, + gamma: f64, + ) -> Self { + let g_sum = idxs.iter().map(|&i| g[i]).sum::(); + let h_sum = idxs.iter().map(|&i| h[i]).sum::(); + let value = -g_sum / (h_sum + lambda); + + let mut best_feature_idx = usize::MAX; + let mut best_split_score = 0.0; + let mut best_threshold = 0.0; + let mut left = Option::None; + let mut right = Option::None; + + if max_depth > 0 { + Self::insert_child_nodes( + data, + g, + h, + idxs, + &mut best_feature_idx, + &mut best_split_score, + &mut best_threshold, + &mut left, + &mut right, + max_depth, + min_child_weight, + lambda, + gamma, + ); + } + + Self { + left, + right, + value, + threshold: best_threshold, + split_feature_idx: best_feature_idx, + split_score: best_split_score, + _phantom_tx: PhantomData, + _phantom_ty: PhantomData, + _phantom_x: PhantomData, + _phantom_y: PhantomData, + } + } + + /// Finds the best split and creates child nodes if a valid split is found. + fn insert_child_nodes( + data: &X, + g: &Vec, + h: &Vec, + idxs: &[usize], + best_feature_idx: &mut usize, + best_split_score: &mut f64, + best_threshold: &mut f64, + left: &mut Option>, + right: &mut Option>, + max_depth: u16, + min_child_weight: f64, + lambda: f64, + gamma: f64, + ) { + let (_, n_features) = data.shape(); + for i in 0..n_features { + Self::find_best_split( + data, + g, + h, + idxs, + i, + best_feature_idx, + best_split_score, + best_threshold, + min_child_weight, + lambda, + gamma, + ); + } + + // A split is only valid if it results in a positive gain. 
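+        // For reference (see `find_best_split`): with gradient sum G and hessian sum H
+        // on each side of a candidate threshold, the gain of a split is
+        //   0.5 * (G_L^2 / (H_L + lambda) + G_R^2 / (H_R + lambda) - G^2 / (H + lambda)) - gamma
+        // while the leaf value computed in `fit` is -G / (H + lambda). A best score of
+        // 0.0 therefore means no candidate improved on keeping this node a leaf.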
+ if *best_split_score > 0.0 { + let mut left_idxs = Vec::new(); + let mut right_idxs = Vec::new(); + for idx in idxs.iter() { + if data.get((*idx, *best_feature_idx)).to_f64().unwrap() <= *best_threshold { + left_idxs.push(*idx); + } else { + right_idxs.push(*idx); + } + } + + *left = Some(Box::new(TreeRegressor::fit( + data, + g, + h, + &left_idxs, + max_depth - 1, + min_child_weight, + lambda, + gamma, + ))); + *right = Some(Box::new(TreeRegressor::fit( + data, + g, + h, + &right_idxs, + max_depth - 1, + min_child_weight, + lambda, + gamma, + ))); + } + } + + /// Iterates through a single feature to find the best possible split point. + fn find_best_split( + data: &X, + g: &[f64], + h: &[f64], + idxs: &[usize], + feature_idx: usize, + best_feature_idx: &mut usize, + best_split_score: &mut f64, + best_threshold: &mut f64, + min_child_weight: f64, + lambda: f64, + gamma: f64, + ) { + let mut sorted_idxs = idxs.to_owned(); + sorted_idxs.sort_by(|a, b| { + data.get((*a, feature_idx)) + .partial_cmp(data.get((*b, feature_idx))) + .unwrap() + }); + + let sum_g = sorted_idxs.iter().map(|&i| g[i]).sum::(); + let sum_h = sorted_idxs.iter().map(|&i| h[i]).sum::(); + + let mut sum_g_right = sum_g; + let mut sum_h_right = sum_h; + let mut sum_g_left = 0.0; + let mut sum_h_left = 0.0; + + for i in 0..sorted_idxs.len() - 1 { + let idx = sorted_idxs[i]; + let next_idx = sorted_idxs[i + 1]; + + let g_i = g[idx]; + let h_i = h[idx]; + let x_i = data.get((idx, feature_idx)).to_f64().unwrap(); + let x_i_next = data.get((next_idx, feature_idx)).to_f64().unwrap(); + + sum_g_left += g_i; + sum_h_left += h_i; + sum_g_right -= g_i; + sum_h_right -= h_i; + + if sum_h_left < min_child_weight || x_i == x_i_next { + continue; + } + if sum_h_right < min_child_weight { + break; + } + + let gain = 0.5 + * ((sum_g_left * sum_g_left / (sum_h_left + lambda)) + + (sum_g_right * sum_g_right / (sum_h_right + lambda)) + - (sum_g * sum_g / (sum_h + lambda))) + - gamma; + + if gain > *best_split_score { + *best_split_score = gain; + *best_threshold = (x_i + x_i_next) / 2.0; + *best_feature_idx = feature_idx; + } + } + } + + /// Predicts the output values for a dataset. + pub fn predict(&self, data: &X) -> Vec { + let (n_samples, n_features) = data.shape(); + (0..n_samples) + .map(|i| { + self.predict_for_row(&Vec::from_iterator( + data.get_row(i).iterator(0).copied(), + n_features, + )) + }) + .collect() + } + + /// Predicts the output value for a single row of data by traversing the tree. + pub fn predict_for_row(&self, row: &Vec) -> f64 { + // A leaf node is identified by having no children. + if self.left.is_none() { + return self.value; + } + + // Recurse down the appropriate branch. + let child = if row[self.split_feature_idx].to_f64().unwrap() <= self.threshold { + self.left.as_ref().unwrap() + } else { + self.right.as_ref().unwrap() + }; + + child.predict_for_row(row) + } +} + +/// Parameters for the `jRegressor` model. +/// +/// This struct holds all the hyperparameters that control the training process. +#[derive(Clone, Debug)] +pub struct XGRegressorParameters { + /// The number of boosting rounds or trees to build. + pub n_estimators: usize, + /// The maximum depth of each tree. + pub max_depth: u16, + /// Step size shrinkage used to prevent overfitting. + pub learning_rate: f64, + /// Minimum sum of instance weight (hessian) needed in a child. + pub min_child_weight: usize, + /// L2 regularization term on weights. + pub lambda: f64, + /// Minimum loss reduction required to make a further partition on a leaf node. 
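+    ///
+    /// `gamma` is subtracted from every candidate split gain, so larger values prune
+    /// more aggressively (the reference XGBoost library calls this `min_split_loss`).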
+ pub gamma: f64, + /// The initial prediction score for all instances. + pub base_score: f64, + /// The fraction of samples to be used for fitting the individual base learners. + pub subsample: f64, + /// The seed for the random number generator for reproducibility. + pub seed: u64, + /// The objective function to be optimized. + pub objective: Objective, +} + +impl Default for XGRegressorParameters { + /// Creates a new set of `XGRegressorParameters` with default values. + fn default() -> Self { + Self { + n_estimators: 100, + learning_rate: 0.3, + max_depth: 6, + min_child_weight: 1, + lambda: 1.0, + gamma: 0.0, + base_score: 0.5, + subsample: 1.0, + seed: 0, + objective: Objective::MeanSquaredError, + } + } +} + +// Builder pattern for XGRegressorParameters +impl XGRegressorParameters { + /// Sets the number of boosting rounds or trees to build. + pub fn with_n_estimators(mut self, n_estimators: usize) -> Self { + self.n_estimators = n_estimators; + self + } + + /// Sets the step size shrinkage used to prevent overfitting. + /// + /// Also known as `eta`. A smaller value makes the model more robust by preventing + /// too much weight being given to any single tree. + pub fn with_learning_rate(mut self, learning_rate: f64) -> Self { + self.learning_rate = learning_rate; + self + } + + /// Sets the maximum depth of each individual tree. + // A lower value helps prevent overfitting.* + pub fn with_max_depth(mut self, max_depth: u16) -> Self { + self.max_depth = max_depth; + self + } + + /// Sets the minimum sum of instance weight (hessian) needed in a child node. + /// + /// If the tree partition step results in a leaf node with the sum of + // instance weight less than `min_child_weight`, then the building process* + /// will give up further partitioning. + pub fn with_min_child_weight(mut self, min_child_weight: usize) -> Self { + self.min_child_weight = min_child_weight; + self + } + + /// Sets the L2 regularization term on weights (`lambda`). + /// + /// Increasing this value will make the model more conservative. + pub fn with_lambda(mut self, lambda: f64) -> Self { + self.lambda = lambda; + self + } + + /// Sets the minimum loss reduction required to make a further partition on a leaf node. + /// + /// The larger `gamma` is, the more conservative the algorithm will be. + pub fn with_gamma(mut self, gamma: f64) -> Self { + self.gamma = gamma; + self + } + + /// Sets the initial prediction score for all instances. + pub fn with_base_score(mut self, base_score: f64) -> Self { + self.base_score = base_score; + self + } + + /// Sets the fraction of samples to be used for fitting individual base learners. + /// + /// A value of less than 1.0 introduces randomness and helps prevent overfitting. + pub fn with_subsample(mut self, subsample: f64) -> Self { + self.subsample = subsample; + self + } + + /// Sets the seed for the random number generator for reproducibility. + pub fn with_seed(mut self, seed: u64) -> Self { + self.seed = seed; + self + } + + /// Sets the objective function to be optimized during training. + pub fn with_objective(mut self, objective: Objective) -> Self { + self.objective = objective; + self + } +} + +/// An Extreme Gradient Boosting (XGBoost) model for regression and classification tasks. +pub struct XGRegressor, Y: Array1> { + regressors: Option>>, + parameters: Option, + _phantom_ty: PhantomData, + _phantom_tx: PhantomData, + _phantom_y: PhantomData, + _phantom_x: PhantomData, +} + +impl, Y: Array1> XGRegressor { + /// Fits the XGBoost model to the training data. 
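+    ///
+    /// Training is stage-wise: predictions start at `base_score`, and each of the
+    /// `n_estimators` rounds fits a `TreeRegressor` to the current gradients and
+    /// hessians (optionally on a random subsample of rows), then updates every
+    /// prediction by `learning_rate` times the tree's output.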
+ pub fn fit(data: &X, y: &Y, parameters: XGRegressorParameters) -> Result { + if parameters.subsample > 1.0 || parameters.subsample <= 0.0 { + return Err(Failed::because( + FailedError::ParametersError, + "Subsample ratio must be in (0, 1].", + )); + } + + let (n_samples, _) = data.shape(); + let learning_rate = parameters.learning_rate; + let mut predictions = vec![parameters.base_score; n_samples]; + + let mut regressors = Vec::new(); + let mut rng = get_rng_impl(Some(parameters.seed)); + + for _ in 0..parameters.n_estimators { + let gradients = parameters.objective.gradient(y, &predictions); + let hessians = parameters.objective.hessian(y, &predictions); + + let sample_idxs = if parameters.subsample < 1.0 { + Self::sample_without_replacement(n_samples, parameters.subsample, &mut rng) + } else { + (0..n_samples).collect::>() + }; + + let regressor = TreeRegressor::fit( + data, + &gradients, + &hessians, + &sample_idxs, + parameters.max_depth, + parameters.min_child_weight as f64, + parameters.lambda, + parameters.gamma, + ); + + let corrections = regressor.predict(data); + predictions = zip(predictions, corrections) + .map(|(pred, correction)| pred + (learning_rate * correction)) + .collect(); + + regressors.push(regressor); + } + + Ok(Self { + regressors: Some(regressors), + parameters: Some(parameters), + _phantom_ty: PhantomData, + _phantom_y: PhantomData, + _phantom_tx: PhantomData, + _phantom_x: PhantomData, + }) + } + + /// Predicts target values for the given input data. + pub fn predict(&self, data: &X) -> Result, Failed> { + let (n_samples, _) = data.shape(); + + let parameters = self.parameters.as_ref().unwrap(); + let mut predictions = vec![parameters.base_score; n_samples]; + let regressors = self.regressors.as_ref().unwrap(); + + for regressor in regressors.iter() { + let corrections = regressor.predict(data); + predictions = zip(predictions, corrections) + .map(|(pred, correction)| pred + (parameters.learning_rate * correction)) + .collect(); + } + + Ok(predictions + .into_iter() + .map(|p| TX::from_f64(p).unwrap()) + .collect()) + } + + /// Creates a random sample of indices without replacement. + fn sample_without_replacement( + population_size: usize, + subsample_ratio: f64, + rng: &mut impl Rng, + ) -> Vec { + let mut indices: Vec = (0..population_size).collect(); + indices.shuffle(rng); + indices.truncate((population_size as f64 * subsample_ratio) as usize); + indices + } +} + +// Boilerplate implementation for the smartcore traits +impl, Y: Array1> + SupervisedEstimatorBorrow<'_, X, Y, XGRegressorParameters> for XGRegressor +{ + fn new() -> Self { + Self { + regressors: None, + parameters: None, + _phantom_ty: PhantomData, + _phantom_y: PhantomData, + _phantom_tx: PhantomData, + _phantom_x: PhantomData, + } + } + + fn fit(x: &X, y: &Y, parameters: &XGRegressorParameters) -> Result { + XGRegressor::fit(x, y, parameters.clone()) + } +} + +impl, Y: Array1> PredictorBorrow<'_, X, TX> + for XGRegressor +{ + fn predict(&self, x: &X) -> Result, Failed> { + self.predict(x) + } +} + +// ------------------- TESTS ------------------- +#[cfg(test)] +mod tests { + use super::*; + use crate::linalg::basic::{arrays::Array, matrix::DenseMatrix}; + + /// Tests the gradient and hessian calculations for MeanSquaredError. 
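+    ///
+    /// Worked by hand: with y_true = [1.0, 2.0, 3.0] and y_pred = [1.5, 2.5, 2.5],
+    /// the MSE gradient (y_pred - y_true) is [0.5, 0.5, -0.5] and the hessian is
+    /// 1.0 for every sample.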
+    #[test]
+    fn test_mse_objective() {
+        let objective = Objective::MeanSquaredError;
+        let y_true = vec![1.0, 2.0, 3.0];
+        let y_pred = vec![1.5, 2.5, 2.5];
+
+        let gradients = objective.gradient(&y_true, &y_pred);
+        let hessians = objective.hessian(&y_true, &y_pred);
+
+        // Gradients should be (pred - true)
+        assert_eq!(gradients, vec![0.5, 0.5, -0.5]);
+        // Hessians should be all 1.0 for MSE
+        assert_eq!(hessians, vec![1.0, 1.0, 1.0]);
+    }
+
+    #[test]
+    fn test_find_best_split_multidimensional() {
+        // Data has two features. The second feature is a better predictor.
+        let data = vec![
+            vec![1.0, 10.0], // g = -0.5
+            vec![1.0, 20.0], // g = -1.0
+            vec![1.0, 30.0], // g = 1.0
+            vec![1.0, 40.0], // g = 1.5
+        ];
+        let data = DenseMatrix::from_2d_vec(&data).unwrap();
+        let g = vec![-0.5, -1.0, 1.0, 1.5];
+        let h = vec![1.0, 1.0, 1.0, 1.0];
+        let idxs = (0..4).collect::<Vec<usize>>();
+
+        let mut best_feature_idx = usize::MAX;
+        let mut best_split_score = 0.0;
+        let mut best_threshold = 0.0;
+
+        // Manually calculated expected gain for the best split (on feature 1, with lambda=1.0).
+        // G_left = -1.5, H_left = 2.0
+        // G_right = 2.5, H_right = 2.0
+        // G_total = 1.0, H_total = 4.0
+        // Gain = 0.5 * (G_l^2/(H_l+λ) + G_r^2/(H_r+λ) - G_t^2/(H_t+λ))
+        // Gain = 0.5 * ((-1.5)^2/(2+1) + (2.5)^2/(2+1) - (1.0)^2/(4+1))
+        // Gain = 0.5 * (2.25/3 + 6.25/3 - 1.0/5) = 0.5 * (0.75 + 2.0833 - 0.2) = 1.3166...
+        let expected_gain = 1.3166666666666667;
+
+        // Search both features. The algorithm must find the best split on feature 1.
+        let (_, n_features) = data.shape();
+        for i in 0..n_features {
+            TreeRegressor::<f64, f64, DenseMatrix<f64>, Vec<f64>>::find_best_split(
+                &data,
+                &g,
+                &h,
+                &idxs,
+                i,
+                &mut best_feature_idx,
+                &mut best_split_score,
+                &mut best_threshold,
+                1.0,
+                1.0,
+                0.0,
+            );
+        }
+
+        assert_eq!(best_feature_idx, 1); // Should choose the second feature
+        assert!((best_split_score - expected_gain).abs() < 1e-9);
+        assert_eq!(best_threshold, 25.0); // (20 + 30) / 2
+    }
+
+    /// Tests that the TreeRegressor can build a simple one-level tree on multidimensional data.
+    #[test]
+    fn test_tree_regressor_fit_multidimensional() {
+        let data = vec![
+            vec![1.0, 10.0],
+            vec![1.0, 20.0],
+            vec![1.0, 30.0],
+            vec![1.0, 40.0],
+        ];
+        let data = DenseMatrix::from_2d_vec(&data).unwrap();
+        let g = vec![-0.5, -1.0, 1.0, 1.5];
+        let h = vec![1.0, 1.0, 1.0, 1.0];
+        let idxs = (0..4).collect::<Vec<usize>>();
+
+        let tree = TreeRegressor::<f64, f64, DenseMatrix<f64>, Vec<f64>>::fit(
+            &data, &g, &h, &idxs, 2, 1.0, 1.0, 0.0,
+        );
+
+        // Check that the root node was split on the correct feature
+        assert!(tree.left.is_some());
+        assert!(tree.right.is_some());
+        assert_eq!(tree.split_feature_idx, 1); // Should split on the second feature
+        assert_eq!(tree.threshold, 25.0);
+
+        // Check leaf values (value = -G / (H + lambda))
+        // Left leaf: G = -1.5, H = 2.0 => value = -(-1.5)/(2+1) = 0.5
+        // Right leaf: G = 2.5, H = 2.0 => value = -(2.5)/(2+1) = -0.8333
+        assert!((tree.left.unwrap().value - 0.5).abs() < 1e-9);
+        assert!((tree.right.unwrap().value - (-0.833333333)).abs() < 1e-9);
+    }
+
+    /// A "smoke test" to ensure the main XGRegressor can fit and predict on multidimensional data.
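+    ///
+    /// It only asserts that `fit` and `predict` succeed and that one prediction is
+    /// returned per input row; it does not check the predicted values themselves.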
+ #[test] + fn test_xgregressor_fit_predict_multidimensional() { + // Simple 2D data where y is roughly 2*x1 + 3*x2 + let x_vec = vec![ + vec![1.0, 1.0], + vec![2.0, 1.0], + vec![1.0, 2.0], + vec![2.0, 2.0], + ]; + let x = DenseMatrix::from_2d_vec(&x_vec).unwrap(); + let y = vec![5.0, 7.0, 8.0, 10.0]; + + let params = XGRegressorParameters::default() + .with_n_estimators(10) + .with_max_depth(2); + + let fit_result = XGRegressor::fit(&x, &y, params); + assert!( + fit_result.is_ok(), + "Fit failed with error: {:?}", + fit_result.err() + ); + + let model = fit_result.unwrap(); + let predict_result = model.predict(&x); + assert!( + predict_result.is_ok(), + "Predict failed with error: {:?}", + predict_result.err() + ); + + let predictions = predict_result.unwrap(); + assert_eq!(predictions.len(), 4); + } +} From 35bf52b5e4012ceeea3c5fcdfdf2667b32aa6a4c Mon Sep 17 00:00:00 2001 From: Daniel Lacina Date: Wed, 9 Jul 2025 13:23:13 -0500 Subject: [PATCH 3/4] fixed unecessary changes --- Cargo.toml | 2 -- 1 file changed, 2 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index e48b5d0c..3c1b8ab9 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -28,8 +28,6 @@ num = "0.4" rand = { version = "0.8.5", default-features = false, features = ["small_rng"] } rand_distr = { version = "0.4", optional = true } serde = { version = "1", features = ["derive"], optional = true } -keyed_priority_queue = "0.4.2" -ordered-float = "5.0.0" [target.'cfg(not(target_arch = "wasm32"))'.dependencies] typetag = { version = "0.2", optional = true } From 50ac3976b39d993e6035bba15bedefba03407e2c Mon Sep 17 00:00:00 2001 From: Daniel Lacina Date: Wed, 9 Jul 2025 13:24:35 -0500 Subject: [PATCH 4/4] fixed unecessary changes --- src/tree/decision_tree_regressor.rs | 1 - 1 file changed, 1 deletion(-) diff --git a/src/tree/decision_tree_regressor.rs b/src/tree/decision_tree_regressor.rs index af4e2748..0f574402 100644 --- a/src/tree/decision_tree_regressor.rs +++ b/src/tree/decision_tree_regressor.rs @@ -584,7 +584,6 @@ impl, Y: Array1> use_random_splits: bool, ) { if use_random_splits { - // --- EFFICIENT LOGIC for Continuous Random Splits --- let (min_val, max_val) = { let mut min_opt = None; let mut max_opt = None;