Python/machine_learning/decision_tree.py at 299acbc64a1ddbf5747241b6f102a61ab0c6a4f9 · TheAlgorithms/Python · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
"""
Implementation of a basic regression decision tree.
Input data set: The input data set must be 1-dimensional with continuous labels.
Output: The decision tree maps a real number input to a real number output.
"""

from collections import Counter

import numpy as np


class DecisionTree:
    def __init__(self, depth=5, min_leaf_size=5, task="regression", criterion="gini"):
        self.depth = depth
        self.decision_boundary = 0
        self.left = None
        self.right = None
        self.min_leaf_size = min_leaf_size
        self.prediction = None
        self.task = task
        self.criterion = criterion

    def mean_squared_error(self, labels, prediction):
        """
        mean_squared_error:
        @param labels: a one-dimensional numpy array
        @param prediction: a floating point value
        return value: mean_squared_error calculates the error if prediction is used to
            estimate the labels
        >>> tester = DecisionTree()
        >>> test_labels = np.array([1,2,3,4,5,6,7,8,9,10])
        >>> test_prediction = float(6)
        >>> bool(tester.mean_squared_error(test_labels, test_prediction) == (
        ...     TestDecisionTree.helper_mean_squared_error_test(test_labels,
        ...         test_prediction)))
        True
        >>> test_labels = np.array([1,2,3])
        >>> test_prediction = float(2)
        >>> bool(tester.mean_squared_error(test_labels, test_prediction) == (
        ...     TestDecisionTree.helper_mean_squared_error_test(test_labels,
        ...         test_prediction)))
        True
        """
        if labels.ndim != 1:
            raise ValueError("Input labels must be one dimensional")
        return np.mean((labels - prediction) ** 2)

    def gini(self, y):
        """
        Computes the Gini impurity for a set of labels.
        Gini impurity measures how often a randomly chosen element
        would be incorrectly classified.
        Formula: Gini = 1 - sum(p_i^2)
        where p_i is the probability of class i.

        Lower Gini value indicates better purity (best split).
        """
        _, counts = np.unique(y, return_counts=True)
        prob = counts / counts.sum()
        return 1 - np.sum(prob**2)

    def entropy(self, y):
        """
        Computes the entropy (impurity) of a set of labels.
        Entropy measures the randomness or disorder in the data.
        Formula: Entropy = - sum(p_i * log2(p_i))
        where p_i is the probability of class i.

        Lower entropy means higher purity.
        """
        _, counts = np.unique(y, return_counts=True)
        prob = counts / counts.sum()
        return -np.sum(prob * np.log2(prob + 1e-9))

    def information_gain(self, parent, left, right):
        """
        Computes the information gain from splitting a dataset.
        Information gain represents the reduction in impurity
        after a dataset is split into left and right subsets.
        Formula: IG = Impurity(parent) - [
            weighted impurity(left) + weighted impurity(right)]

        Higher information gain indicates a better split.
        """
        if self.criterion == "gini":
            func = self.gini
        elif self.criterion == "entropy":
            func = self.entropy
        else:
            raise ValueError("Invalid criterion")

        weight_l = len(left) / len(parent)
        weight_r = len(right) / len(parent)

        return func(parent) - (weight_l * func(left) + weight_r * func(right))

    def most_common_label(self, y):
        return Counter(y).most_common(1)[0][0]

    def train(self, x, y):
        """
        train:
        @param x: a one-dimensional numpy array
        @param y: a one-dimensional numpy array.
        The contents of y are the labels for the corresponding X values

        train() does not have a return value

        Examples:
        1. Try to train when x & y are of same length & 1 dimensions (No errors)
        >>> dt = DecisionTree()
        >>> dt.train(np.array([10,20,30,40,50]),np.array([0,0,0,1,1]))

        2. Try to train when x is 2 dimensions
        >>> dt = DecisionTree()
        >>> dt.train(np.array([[1,2,3,4,5],[1,2,3,4,5]]),np.array([0,0,0,1,1]))
        Traceback (most recent call last):
            ...
        ValueError: Input data set must be one-dimensional

        3. Try to train when x and y are not of the same length
        >>> dt = DecisionTree()
        >>> dt.train(np.array([1,2,3,4,5]),np.array([[0,0,0,1,1],[0,0,0,1,1]]))
        Traceback (most recent call last):
            ...
        ValueError: x and y have different lengths

        4. Try to train when x & y are of the same length but different dimensions
        >>> dt = DecisionTree()
        >>> dt.train(np.array([1,2,3,4,5]),np.array([[1],[2],[3],[4],[5]]))
        Traceback (most recent call last):
            ...
        ValueError: Data set labels must be one-dimensional

        This section is to check that the inputs conform to our dimensionality
        constraints
        """
        if x.ndim != 1:
            raise ValueError("Input data set must be one-dimensional")
        if len(x) != len(y):
            raise ValueError("x and y have different lengths")
        if y.ndim != 1:
            raise ValueError("Data set labels must be one-dimensional")

        if len(x) < 2 * self.min_leaf_size or self.depth == 1:
            if self.task == "regression":
                self.prediction = np.mean(y)
            else:
                self.prediction = self.most_common_label(y)
            return

        best_split = 0

        """
        loop over all possible splits for the decision tree. find the best split.
        if no split exists that is less than 2 * error for the entire array
        then the data set is not split and the average for the entire array is used as
        the predictor
        """
        best_score = float("inf") if self.task == "regression" else -float("inf")

        for i in range(len(x)):
            if len(x[:i]) < self.min_leaf_size:
                continue
            if len(x[i:]) < self.min_leaf_size:
                continue

            left_y = y[:i]
            right_y = y[i:]

            if self.task == "regression":
                error_left = self.mean_squared_error(left_y, np.mean(left_y))
                error_right = self.mean_squared_error(right_y, np.mean(right_y))
                score = error_left + error_right

                if score < best_score:
                    best_score = score
                    best_split = i

            else:
                gain = self.information_gain(y, left_y, right_y)

                if gain > best_score:
                    best_score = gain
                    best_split = i

        if best_split != 0:
            left_x = x[:best_split]
            left_y = y[:best_split]
            right_x = x[best_split:]
            right_y = y[best_split:]

            self.decision_boundary = x[best_split]

            self.left = DecisionTree(
                depth=self.depth - 1,
                min_leaf_size=self.min_leaf_size,
                task=self.task,
                criterion=self.criterion,
            )
            self.right = DecisionTree(
                depth=self.depth - 1,
                min_leaf_size=self.min_leaf_size,
                task=self.task,
                criterion=self.criterion,
            )

            self.left.train(left_x, left_y)
            self.right.train(right_x, right_y)

        elif self.task == "regression":
            self.prediction = np.mean(y)
        else:
            self.prediction = self.most_common_label(y)

    def predict(self, x):
        """
        predict:
        @param x: a floating point value to predict the label of
        the prediction function works by recursively calling the predict function
        of the appropriate subtrees based on the tree's decision boundary
        """
        if self.prediction is not None:
            return self.prediction
        if self.left is not None and self.right is not None:
            if x >= self.decision_boundary:
                return self.right.predict(x)
            else:
                return self.left.predict(x)

        raise ValueError("Decision tree not yet trained")


class TestDecisionTree:
    """Decision Tres test class"""

    @staticmethod
    def helper_mean_squared_error_test(labels, prediction):
        """
        helper_mean_squared_error_test:
        @param labels: a one dimensional numpy array
        @param prediction: a floating point value
        return value: helper_mean_squared_error_test calculates the mean squared error
        """
        squared_error_sum = float(0)
        for label in labels:
            squared_error_sum += (label - prediction) ** 2

        return float(squared_error_sum / labels.size)


def main():
    """
    In this demonstration we're generating a sample data set from the sin function in
    numpy.  We then train a decision tree on the data set and use the decision tree to
    predict the label of 10 different test values. Then the mean squared error over
    this test is displayed.
    """
    x = np.arange(-1.0, 1.0, 0.005)
    y = np.sin(x)

    tree = DecisionTree(depth=10, min_leaf_size=10, task="regression")
    tree.train(x, y)

    print("Regression prediction:", tree.predict(0.5))
    x_cls = np.array([1, 2, 3, 4, 5, 6])
    y_cls = np.array([0, 0, 0, 1, 1, 1])

    clf = DecisionTree(
        depth=3, min_leaf_size=1, task="classification", criterion="gini"
    )
    clf.train(x_cls, y_cls)

    print("Classification prediction (2):", clf.predict(2))
    print("Classification prediction (5):", clf.predict(5))


if __name__ == "__main__":
    main()
    import doctest

    doctest.testmod(name="mean_squared_error", verbose=True)