- 李航 (Li Hang): Naive Bayes is a classification method based on Bayes' theorem and the assumption of conditional independence among features. Given a training data set, it first learns the joint probability distribution of input and output under this independence assumption; then, for a given input x, it uses Bayes' theorem to find the output y with the largest posterior probability. Naive Bayes is simple to implement and efficient to train.
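In symbols (following the book's notation), the "naive" step is the conditional-independence assumption, and since the evidence P(X = x) is the same for every class, the decision rule reduces to a product of the prior and the per-feature conditionals:

$$
P(X=x \mid Y=c_k)=\prod_{j=1}^{n} P(X^{(j)}=x^{(j)} \mid Y=c_k)
$$

$$
y=\arg\max_{c_k} P(Y=c_k)\prod_{j=1}^{n} P(X^{(j)}=x^{(j)} \mid Y=c_k)
$$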
Python code for Example 4.2 in the book:
import numpy as np

class bayes(object):
    def __init__(self, data, label, num_class, L):
        # data : (ndarray) shape (samples_nums, features_nums)
        # label: (ndarray) class labels, assumed to be 0 .. num_class-1
        self.data = data
        self.label = label
        self.num_class = num_class
        self.L = L                                # smoothing parameter (lambda)
        self.p_prams = []                         # p_prams[k][j][l] = P(X^(j)=a_jl | Y=k)
        self.p_label = np.zeros(self.num_class)   # priors P(Y=k)
        self.fea_condition = []                   # value set of each feature
        self.__get_p_gram()

    def __get_p_gram(self):
        self.__generation_features_conditional()
        index_sort = np.argsort(self.label)  # sample indices grouped by class
        start = 0
        for i in range(self.num_class):
            # i indexes the i-th class
            Ik = list(self.label).count(i)
            self.p_label[i] = (Ik + self.L) / (len(self.label) + self.num_class * self.L)
            end = start + Ik
            condition_k = []  # for class i: the probability of every value of each feature
            for index, condition in enumerate(self.fea_condition):
                temp = self.data[index_sort[start:end], index]
                condition_kj = []  # probabilities of the values of the index-th feature
                for c in condition:
                    condition_kj.append((list(temp).count(c) + self.L) / (Ik + len(condition) * self.L))
                condition_k.append(condition_kj)
            start = end
            self.p_prams.append(condition_k)

    def __generation_features_conditional(self):
        # collect all values each feature takes in the training set
        features_nums = self.data.shape[1]
        for j in range(features_nums):
            # j indexes the j-th feature
            self.fea_condition.append(np.unique(self.data[:, j]))

    def classify(self, target):
        p = list(self.p_label)
        for index, _ in enumerate(p):
            for fea_index, fea in enumerate(list(target)):
                # position of the value within the feature's value set S_j;
                # note fea_condition is indexed by feature, not by class
                fea_local = list(self.fea_condition[fea_index]).index(fea)
                p[index] *= self.p_prams[index][fea_index][fea_local]
        c = int(np.argmax(p))
        return p, c
data = np.array([[1, 1], [1, 2], [1, 2], [1, 1], [1, 1],
                 [2, 1], [2, 2], [2, 2], [2, 3], [2, 3],
                 [3, 3], [3, 2], [3, 2], [3, 3], [3, 3]])
label = np.array([0, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0])
target = [2, 1]
num_class = 2
L = 1  # Laplace smoothing parameter
model = bayes(data, label, num_class, L)
p, c = model.classify(target)
print('Target belongs to class %s,\nP = %s.\n' % (c, p[c]))
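Here L is the λ of the book's Bayesian estimation (L = 1 gives Laplace smoothing); `__get_p_gram` computes exactly these smoothed estimates:

$$
P_\lambda(Y=c_k)=\frac{\sum_{i=1}^{N} I(y_i=c_k)+\lambda}{N+K\lambda},\qquad
P_\lambda(X^{(j)}=a_{jl}\mid Y=c_k)=\frac{\sum_{i=1}^{N} I(x_i^{(j)}=a_{jl},\,y_i=c_k)+\lambda}{\sum_{i=1}^{N} I(y_i=c_k)+S_j\lambda}
$$

where K is the number of classes and S_j the number of distinct values of the j-th feature. Running the script reproduces Example 4.2: the target [2, 1] is assigned class 0 (the book's y = -1), with an unnormalized posterior of about 0.0610.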
The sklearn version uses the Kaggle MNIST data, with the features reduced to six dimensions by PCA:
# -*- coding: utf-8 -*-
"""
An example of classification with sklearn's naive Bayes,
using the Kaggle handwritten-digit (MNIST) data set.
"""
import pandas as pd
import numpy as np
from sklearn.naive_bayes import GaussianNB
from sklearn.decomposition import PCA

# load a data set; the PCA is fitted on the training data and reused for the test data
def load_data(filename, n, mode, pca=None):
    data_pd = pd.read_csv(filename)
    data = np.asarray(data_pd)
    if mode != 'test':
        pca = PCA(n_components=n)
        dataset = pca.fit_transform(data[:, 1:])  # first column holds the labels
        return dataset, data[:, 0], pca
    else:
        # transform with the PCA fitted on the training set; refitting a new
        # PCA on the test set would project the two sets into inconsistent spaces
        dataset = pca.transform(data)
        return dataset, None, pca
def main(train_data_path, test_data_path, n_dim):
    train_data, train_label, pca = load_data(train_data_path, n_dim, 'train')
    print("Train set :" + repr(len(train_data)))
    test_data, _, _ = load_data(test_data_path, n_dim, 'test', pca)
    print("Test set :" + repr(len(test_data)))
    bys = GaussianNB()
    # fit the model on the training set
    bys.fit(train_data, train_label)
    # accuracy on the training set
    score = bys.score(train_data, train_label)
    print(">Training accuracy = " + repr(score))
    predictions = []
    for index in range(len(test_data)):
        # predict the test samples one by one
        result = bys.predict([test_data[index]])
        predictions.append([index + 1, result[0]])
        print(">Index : %s, predicted = %s" % (index + 1, result[0]))
    # write a Kaggle-style submission file
    columns = ['ImageId', 'Label']
    save_file = pd.DataFrame(columns=columns, data=predictions)
    save_file.to_csv('bys.csv', index=False, encoding="utf-8")

if __name__ == "__main__":
    train_data_path = 'train.csv'
    test_data_path = 'test.csv'
    n_dim = 6  # number of PCA components
    main(train_data_path, test_data_path, n_dim)
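Note that the script only reports training accuracy, which is optimistic. Here is a minimal sketch (assuming the same train.csv layout as above) that estimates generalization accuracy with 5-fold cross-validation on the PCA-reduced features; GaussianNB is the right variant here because the PCA outputs are continuous, not counts:

```python
from sklearn.model_selection import cross_val_score

# reuse load_data to get the PCA-reduced training features
train_data, train_label, _ = load_data('train.csv', 6, 'train')
scores = cross_val_score(GaussianNB(), train_data, train_label, cv=5)
print("5-fold CV accuracy: %.4f +/- %.4f" % (scores.mean(), scores.std()))
```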
If you found this helpful, please follow and like!