1. 贝叶斯定理
如果有两个事件,事件
依据式(1)能够推出贝叶斯定理为
给定一个全集
则广义的贝叶斯定理有
2. 朴素贝叶斯基本原理
给定一组训练数据集
如果如今给定一个新的样本
那怎样求解出这些后验概率呢?依据贝叶斯定理。有
一般地,朴素贝叶斯方法如果各个特征之间是相互独立的,则式(5)能够写成:
由于(6)式的分母。对于每个
以下,是怎样通过样本对
3. 朴素贝叶斯法的參数预计
3.1 极大似然预计
在朴素贝叶斯法中,学习就是意味着预计先验概率
当中,预计先验概率和条件概率的方法有非常多,比方极大似然预计,多项式。高斯。伯努利等。
当中,在极大似然预计中,先验概率
如果输入样本的第
样例1
该样例来自李航的《统计学习方法》。
表中
试求。
数据例如以下所看到的。当中,特征
import numpy as np
import pandas as pd
x1 = np.array([1,1,1,1,1,2,2,2,2,2,3,3,3,3,3])
x2 = np.array([0,1,1,0,0,0,1,1,2,2,2,1,1,2,2])
y = np.array([-1,-1,1,1,-1,-1,-1,1,1,1,1,1,1,1,-1])
dataSet = np.concatenate((x1[:,None],x2[:,None],y[:,None]),axis=1)
df = pd.DataFrame(dataSet,index=np.arange(1,16,1),columns=['X1','X2','y'])
df.T
1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
X1 | 1 | 1 | 1 | 1 | 1 | 2 | 2 | 2 | 2 | 2 | 3 | 3 | 3 | 3 | 3 |
X2 | 0 | 1 | 1 | 0 | 0 | 0 | 1 | 1 | 2 | 2 | 2 | 1 | 1 | 2 | 2 |
y | -1 | -1 | 1 | 1 | -1 | -1 | -1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | -1 |
求解
step1: 求解先验概率
step2 求解条件概率
(2.1) 特征
(2.2) 特征
step3 求解后验概率
由于
例如以下是python的极大似然预计的朴素贝叶斯代码,代码执行结果跟求解一致。
class MLENB:
"""
Maximum likelihood estimation Naive Bayes
Attributes
----------
class_prior_ : array, shape (n_classes, )
Smoothed empirical probability for each class.
class_count_: array, shape (n_classes,)
number of training samples observed in each class.
MLE_: array, shape(n_classes, n_features)
Maximum likelihood estimation of each feature per class, each of element is a dict
"""
def __init__(self):
pass
def fit(self,X,y):
"""Fit maximum likelihood estimation Naive Bayes according to X, y
Parameters
----------
X : array-like, shape (n_samples, n_features)
Training vectors, where n_samples is the number of samples
and n_features is the number of features.
y : array-like, shape (n_samples,)
Target values.
Returns
-------
self : object
Returns self.
"""
n_samples = X.shape[0]
n_features = X.shape[1]
n_classes = len(set(y))
self.class_count_ = np.empty(n_classes)
self.class_prior_ = np.empty(n_classes)
self.MLE_ = np.empty((n_classes,n_features),dtype=dict)
self.target_unique = np.unique(y)
for i in range(n_classes):
dataX_tu = X[y == self.target_unique[i]]
self.class_prior_[i] = dataX_tu.shape[0] / float(len(y))
self.class_count_[i] = dataX_tu.shape[0]
for j in range(n_features):
feature = dataX_tu[:,j]
feature_unique = np.unique(feature)
fp = {}
for f_item in feature_unique:
fp[f_item] = list(feature).count(f_item) / float(len(feature))
self.MLE_[i,j] = fp
return self
def __predict_likelihood(self,x):
if x.ndim == 1:
x = np.array([x])
n_samples = x.shape[0]
n_features = x.shape[1]
n_classes = len(self.class_count_)
likelihood = []
for x_item in x:
class_p = []
for i in range(n_classes):
p = self.class_prior_[i]
for j in range(n_features):
if x_item[j] in self.MLE_[i,j]:
p *= self.MLE_[i,j][x_item[j]]
else:
p *= 0
class_p.append(p)
likelihood.append(class_p)
return np.array(likelihood)
def predict(self,x):
"""Perform classification on an array of test vectors X.
Parameters
----------
X : array-like, shape = [n_samples, n_features]
Returns
-------
C : array, shape = [n_samples]
Predicted target values for X
"""
likelihood = self.__predict_likelihood(x)
max_index = np.argmax(likelihood, axis=1)
return np.array([self.target_unique[i] for i in max_index])
def predict_proba(self,x):
"""
Return probability estimates for the test vector X.
Parameters
----------
X : array-like, shape = [n_samples, n_features]
Returns
-------
C : array-like, shape = [n_samples, n_classes]
Returns the probability of the samples for each class in
the model. The columns correspond to the classes in sorted
order, as they appear in the attribute `classes_`.
"""
likelihood = self.__predict_likelihood(x)
return np.array([lh / np.sum(lh) for lh in likelihood])
# 測验结果
X = dataSet[:,0:-1]
y = dataSet[:,-1]
mlenb = MLENB()
mlenb.fit(X,y)
print(mlenb.predict(np.array([2,0])))
print(mlenb.predict_proba(np.array([2,0])))
[-1]
[[ 0.75 0.25]]
3.2 Multinomial Naive Bayes
用极大似然预计可能会出现所要预计的概率值为0的情况。
这时会影响到后验概率的计算结果,使分类产生偏差。这时。能够採用多项式模型,对先验概率和条件概率做一些平滑处理。详细公式为:
先验概率
如果输入样本的第
当中。
有个疑问:多项式朴素贝叶斯与李航《统计学习方法》中说的贝叶斯预计有啥差别?本文的方法是參考李航的贝叶斯预计。
python的多项式朴素贝叶斯的參考代码例如以下:
class MultinomialNB:
"""Naive Bayes classifier for multinomial models
Attributes
----------
class_prior_ : array, shape (n_classes, )
Smoothed empirical probability for each class.
class_count_: array, shape (n_classes,)
number of training samples observed in each class.
bayes_estimation_: array, shape(n_classes, n_features)
bayes estimations of each feature per class, each of element is a dict
"""
def __init__(self, alpha=1.0):
self.alpha_ = 1.0
def fit(self,X,y):
n_samples = X.shape[0]
n_features = X.shape[1]
n_classes = len(set(y))
self.class_count_ = np.empty(n_classes)
self.class_prior_ = np.empty(n_classes)
self.bayes_estimation_ = np.empty((n_classes,n_features),dtype=dict)
self.target_unique = np.unique(y)
for i in range(n_classes):
dataX_tu = X[y == self.target_unique[i]]
self.class_prior_[i] = (dataX_tu.shape[0] + self.alpha_) / (float(len(y)) + n_classes * self.alpha_)
self.class_count_[i] = dataX_tu.shape[0]
for j in range(n_features):
feature = dataX_tu[:,j]
feature_unique = np.unique(feature)
fp = {}
for f_item in feature_unique:
fp[f_item] = (list(feature).count(f_item) + self.alpha_) / (float(len(feature)) + len(feature_unique) * self.alpha_)
self.bayes_estimation_[i,j] = fp
return self
def __predict_likelihood(self,x):
if x.ndim == 1:
x = np.array([x])
n_samples = x.shape[0]
n_features = x.shape[1]
n_classes = len(self.class_count_)
likelihood = []
for x_item in x:
class_p = []
for i in range(n_classes):
p = self.class_prior_[i]
for j in range(n_features):
if x_item[j] in self.bayes_estimation_[i,j]:
p *= self.bayes_estimation_[i,j][x_item[j]]
else:
p *= 0
class_p.append(p)
likelihood.append(class_p)
return np.array(likelihood)
def predict(self,x):
likelihood = self.__predict_likelihood(x)
max_index = np.argmax(likelihood, axis=1)
return np.array([self.target_unique[i] for i in max_index])
def predict_proba(self,x):
likelihood = self.__predict_likelihood(x)
return np.array([lh / np.sum(lh) for lh in likelihood])
# 測验结果
X = dataSet[:,0:-1]
y = dataSet[:,-1]
mnb = MultinomialNB()
mnb.fit(X,y)
print(mnb.predict(np.array([2,0])))
print(mnb.predict_proba(np.array([2,0])))
[-1]
[[ 0.65116279 0.34883721]]
3.3 Gaussian Naive Bayes
当输入的特征是连续值的时候,我们无法用上面的方法来预计先验概率和条件概率,能够採用高斯模型。
高斯模型如果特征服从高斯分布。
其特征的似然预计例如以下所看到的:
当中。
其python代码例如以下:
class GaussianNB:
"""
Attributes
----------
class_prior_ : array, shape (n_classes,)
probability of each class.
class_count_ : array, shape (n_classes,)
number of training samples observed in each class.
theta_ : array, shape (n_classes, n_features)
mean of each feature per class
sigma_ : array, shape (n_classes, n_features)
variance of each feature per class
"""
def __init__(self):
pass
def fit(self, X, y):
n_samples = X.shape[0]
n_features = X.shape[1]
n_classes = len(set(y))
self.theta_ = np.zeros([n_classes,n_features])
self.sigma_ = np.zeros([n_classes,n_features])
self.class_prior = np.zeros(n_classes)
self.class_count = np.zeros(n_classes)
self.target_unique = np.unique(y)
for i in range(n_classes):
dataX_tu = X[y == self.target_unique[i]]
self.class_prior[i] = dataX_tu.shape[0] / float(len(y))
self.class_count[i] = dataX_tu.shape[0]
self.theta_[i,:] = np.mean(dataX_tu,axis=0)
self.sigma_[i,:] = np.var(dataX_tu,axis=0)
return self
def __predict_likelihood(self,x):
if x.ndim == 1:
x = np.array([x])
n_samples = x.shape[0]
likelihood = []
for x_item in x:
gaussian = np.exp(-(x_item-self.theta_)**2 / (2 * self.sigma_)) / np.sqrt(2*np.pi*self.sigma_)
p = np.exp(np.sum(np.log(gaussian),axis=1))
likelihood.append(self.class_prior * p)
return np.array(likelihood)
def predict(self,x):
likelihood = self.__predict_likelihood(x)
max_index = np.argmax(likelihood, axis=1)
return np.array([self.target_unique[i] for i in max_index])
def predict_proba(self,x):
likelihood = self.__predict_likelihood(x)
return np.array([lh / np.sum(lh) for lh in likelihood])
# 測验结果
X = dataSet[:,0:-1]
y = dataSet[:,-1]
gnb = GaussianNB()
gnb.fit(X,y)
print(gnb.predict(np.array([2,0])))
print(gnb.predict_proba(np.array([2,0])))
[-1]
[[ 0.74566865 0.25433135]]
3.4 Bernoulli Naive Bayes
5. Naive Bayes 注意事项
- Works only with categorical predictors, numerical predictors must be categorized or binned before use
- Works with the assumption of predictor independence, and thus cannot detect or account for relationships between the predictors, unlike a decision tree for example.