ML-From-Scratch/mlfromscratch/supervised_learning/random_forest.py at master · kashu98/ML-From-Scratch · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
from __future__ import division, print_function
import numpy as np
import math
import progressbar

# Import helper functions
from mlfromscratch.utils import divide_on_feature, train_test_split, get_random_subsets, normalize
from mlfromscratch.utils import accuracy_score, calculate_entropy
from mlfromscratch.unsupervised_learning import PCA
from mlfromscratch.supervised_learning import ClassificationTree
from mlfromscratch.utils.misc import bar_widgets
from mlfromscratch.utils import Plot


class RandomForest():
    """Random Forest classifier. Uses a collection of classification trees that
    trains on random subsets of the data using a random subsets of the features.

    Parameters:
    -----------
    n_estimators: int
        The number of classification trees that are used.
    max_features: int
        The maximum number of features that the classification trees are allowed to
        use.
    min_samples_split: int
        The minimum number of samples needed to make a split when building a tree.
    min_gain: float
        The minimum impurity required to split the tree further.
    max_depth: int
        The maximum depth of a tree.
    debug: boolean
        True or false depending on if we wish to display the training errors.
    """
    def __init__(self, n_estimators=100, max_features=None, min_samples_split=2,
                 min_gain=1e-7, max_depth=float("inf"), debug=False):
        self.n_estimators = n_estimators    # Number of trees
        self.max_features = max_features    # Maxmimum number of features per tree
        self.feature_indices = []           # The indices of the features used for each tree
        self.min_samples_split = min_samples_split
        self.min_gain = min_gain            # Minimum information gain req. to continue
        self.max_depth = max_depth          # Maximum depth for tree
        self.debug = debug
        self.bar = progressbar.ProgressBar(widgets=bar_widgets)

        # Initialize decision trees
        self.trees = []
        for _ in range(n_estimators):
            self.trees.append(
                ClassificationTree(
                    min_samples_split=self.min_samples_split,
                    min_impurity=min_gain,
                    max_depth=self.max_depth))

    def fit(self, X, y):
        n_features = np.shape(X)[1]
        # If max_features have not been defined => select it as
        # sqrt(n_features)
        if not self.max_features:
            self.max_features = int(math.sqrt(n_features))

        if self.debug:
            print ("Training (%s estimators):" % (self.n_estimators))
        # Choose one random subset of the data for each tree
        subsets = get_random_subsets(X, y, self.n_estimators)

        for i in self.bar(range(self.n_estimators)):
            X_subset, y_subset = subsets[i]
            # Feature bagging (select random subsets of the features)
            idx = np.random.choice(range(n_features), size=self.max_features, replace=True)
            # Save the indices of the features for prediction
            self.feature_indices.append(idx)
            # Choose the features corresponding to the indices
            X_subset = X_subset[:, idx]
            # Fit the tree to the data
            self.trees[i].fit(X_subset, y_subset)

            if self.debug:
                progress = 100 * (i / self.n_estimators)
                print ("Progress: %.2f%%" % progress)

    def predict(self, X):
        y_preds = []
        # Let each tree make a prediction on the data
        for i, tree in enumerate(self.trees):
            # Select the features that the tree has trained on
            idx = self.feature_indices[i]
            # Make a prediction based on those features
            prediction = tree.predict(X[:, idx])
            y_preds.append(prediction)

        # Take the transpose of the matrix to transform it so
        # that rows are samples and columns are predictions by the
        # estimators
        y_preds = np.array(y_preds).T
        y_pred = []
        # For each sample
        for sample_predictions in y_preds:
            # Do a majority vote over the predictions (columns)
            max_count = 0
            most_common = None
            # For each unique predicted label -> count occurences
            # and save the most predicted label
            for label in np.unique(sample_predictions):
                count = len(sample_predictions[sample_predictions == label])
                if count > max_count:
                    max_count = count
                    most_common = label
            # The most common prediction gets added as final prediction
            # of the sample
            y_pred.append(most_common)
        return y_pred