forked from eriklindernoren/ML-From-Scratch
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathrandom_forest.py
More file actions
112 lines (101 loc) · 4.66 KB
/
random_forest.py
File metadata and controls
112 lines (101 loc) · 4.66 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
from __future__ import division, print_function
import numpy as np
import math
import progressbar
# Import helper functions
from mlfromscratch.utils import divide_on_feature, train_test_split, get_random_subsets, normalize
from mlfromscratch.utils import accuracy_score, calculate_entropy
from mlfromscratch.unsupervised_learning import PCA
from mlfromscratch.supervised_learning import ClassificationTree
from mlfromscratch.utils.misc import bar_widgets
from mlfromscratch.utils import Plot
class RandomForest():
"""Random Forest classifier. Uses a collection of classification trees that
trains on random subsets of the data using a random subsets of the features.
Parameters:
-----------
n_estimators: int
The number of classification trees that are used.
max_features: int
The maximum number of features that the classification trees are allowed to
use.
min_samples_split: int
The minimum number of samples needed to make a split when building a tree.
min_gain: float
The minimum impurity required to split the tree further.
max_depth: int
The maximum depth of a tree.
debug: boolean
True or false depending on if we wish to display the training errors.
"""
def __init__(self, n_estimators=100, max_features=None, min_samples_split=2,
min_gain=1e-7, max_depth=float("inf"), debug=False):
self.n_estimators = n_estimators # Number of trees
self.max_features = max_features # Maxmimum number of features per tree
self.feature_indices = [] # The indices of the features used for each tree
self.min_samples_split = min_samples_split
self.min_gain = min_gain # Minimum information gain req. to continue
self.max_depth = max_depth # Maximum depth for tree
self.debug = debug
self.bar = progressbar.ProgressBar(widgets=bar_widgets)
# Initialize decision trees
self.trees = []
for _ in range(n_estimators):
self.trees.append(
ClassificationTree(
min_samples_split=self.min_samples_split,
min_impurity=min_gain,
max_depth=self.max_depth))
def fit(self, X, y):
n_features = np.shape(X)[1]
# If max_features have not been defined => select it as
# sqrt(n_features)
if not self.max_features:
self.max_features = int(math.sqrt(n_features))
if self.debug:
print ("Training (%s estimators):" % (self.n_estimators))
# Choose one random subset of the data for each tree
subsets = get_random_subsets(X, y, self.n_estimators)
for i in self.bar(range(self.n_estimators)):
X_subset, y_subset = subsets[i]
# Feature bagging (select random subsets of the features)
idx = np.random.choice(range(n_features), size=self.max_features, replace=True)
# Save the indices of the features for prediction
self.feature_indices.append(idx)
# Choose the features corresponding to the indices
X_subset = X_subset[:, idx]
# Fit the tree to the data
self.trees[i].fit(X_subset, y_subset)
if self.debug:
progress = 100 * (i / self.n_estimators)
print ("Progress: %.2f%%" % progress)
def predict(self, X):
y_preds = []
# Let each tree make a prediction on the data
for i, tree in enumerate(self.trees):
# Select the features that the tree has trained on
idx = self.feature_indices[i]
# Make a prediction based on those features
prediction = tree.predict(X[:, idx])
y_preds.append(prediction)
# Take the transpose of the matrix to transform it so
# that rows are samples and columns are predictions by the
# estimators
y_preds = np.array(y_preds).T
y_pred = []
# For each sample
for sample_predictions in y_preds:
# Do a majority vote over the predictions (columns)
max_count = 0
most_common = None
# For each unique predicted label -> count occurences
# and save the most predicted label
for label in np.unique(sample_predictions):
count = len(sample_predictions[sample_predictions == label])
if count > max_count:
max_count = count
most_common = label
# The most common prediction gets added as final prediction
# of the sample
y_pred.append(most_common)
return y_pred