Skip to content
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
22 changes: 13 additions & 9 deletions mlfromscratch/supervised_learning/naive_bayes.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,8 +5,13 @@
from mlfromscratch.utils import Plot, accuracy_score

class NaiveBayes():
"""The Gaussian Naive Bayes classifier. """

def __init__(self):
"""The Gaussian Naive Bayes classifier. """
self.eps = 1e-30 # Added in denominator to prevent division by zero

def fit(self, X, y):
"""Fit the model to a Dataset. """
self.X, self.y = X, y
self.classes = np.unique(y)
self.parameters = []
Expand All @@ -22,9 +27,8 @@ def fit(self, X, y):

def _calculate_likelihood(self, mean, var, x):
""" Gaussian likelihood of the data x given mean and var """
eps = 1e-4 # Added in denominator to prevent division by zero
coeff = 1.0 / math.sqrt(2.0 * math.pi * var + eps)
exponent = math.exp(-(math.pow(x - mean, 2) / (2 * var + eps)))
coeff = 1.0 / math.sqrt(2.0 * math.pi * var + self.eps)
exponent = math.exp(-(math.pow(x - mean, 2) / (2 * var + self.eps)))
return coeff * exponent

def _calculate_prior(self, c):
Expand All @@ -36,7 +40,7 @@ def _calculate_prior(self, c):
def _classify(self, sample):
""" Classification using Bayes Rule P(Y|X) = P(X|Y)*P(Y)/P(X),
or Posterior = Likelihood * Prior / Scaling Factor

P(Y|X) - The posterior is the probability that sample x is of class y given the
feature values of x being distributed according to distribution of y and the prior.
P(X|Y) - Likelihood of data X given class distribution Y.
Expand All @@ -45,21 +49,21 @@ def _classify(self, sample):
P(X) - Scales the posterior to make it a proper probability distribution.
This term is ignored in this implementation since it doesn't affect
which class distribution the sample is most likely to belong to.

Classifies the sample as the class that results in the largest P(Y|X) (posterior)
"""
"""
posteriors = []
# Go through list of classes
for i, c in enumerate(self.classes):
# Initialize posterior as prior
posterior = self._calculate_prior(c)
posterior = np.log(self._calculate_prior(c))
# Naive assumption (independence):
# P(x1,x2,x3|Y) = P(x1|Y)*P(x2|Y)*P(x3|Y)
# Posterior is product of prior and likelihoods (ignoring scaling factor)
for feature_value, params in zip(sample, self.parameters[i]):
# Likelihood of feature value given distribution of feature values given y
likelihood = self._calculate_likelihood(params["mean"], params["var"], feature_value)
posterior *= likelihood
# Calculate Loglikelihood to prevent overflowing in multiplications.
posterior += np.log(likelihood + self.eps)
posteriors.append(posterior)
# Return the class with the largest posterior probability
return self.classes[np.argmax(posteriors)]
Expand Down