Confusion Matrix
from sklearn.datasets import load_iris
from sklearn.linear_model import LogisticRegression
iris = load_iris()
clf = LogisticRegression(max_iter=1000)  # build the logistic regression model (max_iter raised from the default so the solver converges)
clf.fit(iris.data, iris.target)
predicted = clf.predict(iris.data)  # predict on the same data the model was trained on
sum(iris.target == predicted) / len(iris.target)  # compute accuracy by hand
from sklearn.metrics import accuracy_score
accuracy_score(iris.target,predicted)
# use sklearn's built-in accuracy_score to compute the same value
# note: this is accuracy on the training data, and accuracy alone is not very meaningful
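Why this matters: on imbalanced data a trivial model already scores high. A minimal sketch using sklearn's DummyClassifier (the 90/10 toy labels below are hypothetical, for illustration only):
from sklearn.dummy import DummyClassifier
import numpy as np
X_toy = np.zeros((100, 1))             # features carry no information at all
y_toy = np.array([0] * 90 + [1] * 10)  # hypothetical 90/10 class imbalance
dummy = DummyClassifier(strategy='most_frequent')
dummy.fit(X_toy, y_toy)
dummy.score(X_toy, y_toy)              # 0.9 accuracy without learning anything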
from sklearn.metrics import confusion_matrix
m = confusion_matrix(iris.target, predicted)
# confusion matrix of the logistic regression model
%matplotlib inline
import seaborn
seaborn.heatmap(m)  # visualize the confusion matrix as a heatmap
from sklearn.metrics import classification_report
print(classification_report(iris.target,predicted))
# the classification report gives per-class precision, recall, and F1 score, a better basis for judging model quality
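To see where the report's numbers come from, precision and recall can be recomputed from the confusion matrix itself (in sklearn, rows are true classes and columns are predicted classes); a minimal sketch:
import numpy as np
precision = np.diag(m) / m.sum(axis=0)  # per class: TP / (TP + FP), diagonal over column sums
recall = np.diag(m) / m.sum(axis=1)     # per class: TP / (TP + FN), diagonal over row sums
print(precision, recall)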
Cross-Validation
Holdout validation
Randomly select the larger part of the data as the training set and keep the rest as the validation set.
from sklearn.datasets import load_iris
from sklearn.tree import DecisionTreeClassifier
iris = load_iris()
X = iris.data
y = iris.target  # again using the built-in iris dataset
from sklearn.model_selection import train_test_split
train_X, test_X, train_y, test_y = train_test_split(X, y, test_size=0.33, random_state=123)
# train_test_split splits X and y into training and validation sets; test_size is the fraction
# held out for validation; random_state can be any integer, and fixing it makes the split
# reproducible (different values produce different splits)
clf = DecisionTreeClassifier()
clf.fit(train_X, train_y)  # train a decision tree on the training set
from sklearn.metrics import accuracy_score
predicted = clf.predict(test_X)
accuracy_score(test_y, predicted)  # accuracy on the validation set
from sklearn.metrics import confusion_matrix
m = confusion_matrix(test_y, predicted)  # confusion matrix on the validation set
print(m)
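Iris's classes are balanced, so a plain random split works here; for imbalanced data, train_test_split's stratify parameter keeps class proportions the same in both sets. A minimal sketch:
train_X, test_X, train_y, test_y = train_test_split(
    X, y, test_size=0.33, random_state=123, stratify=y)  # both splits keep the class ratios of y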
K-fold cross-validation
Randomly split the data into N folds; train on N-1 folds and validate on the remaining one; repeat N times and average the results.
from sklearn.model_selection import KFold
kf = KFold(n_splits=10, shuffle=True, random_state=123)  # split the data into 10 folds; shuffle because iris is ordered by class
acc = []
for train, test in kf.split(X):
    train_X, test_X, train_y, test_y = X[train], X[test], y[train], y[test]
    clf = DecisionTreeClassifier()
    clf.fit(train_X, train_y)
    predicted = clf.predict(test_X)
    acc.append(accuracy_score(test_y, predicted))
print(sum(acc)/len(acc))  # average validation accuracy over the 10 folds
An alternative approach
from sklearn.model_selection import cross_val_score
acc = cross_val_score(clf, X=iris.data, y=iris.target, cv=10)  # cv=10 means 10-fold cross-validation
# acc is an array of the 10 per-fold accuracies
print(acc.mean())
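cross_val_score is not limited to accuracy; its scoring parameter accepts other metrics by name. A minimal sketch with macro-averaged F1:
f1 = cross_val_score(clf, X=iris.data, y=iris.target, cv=10, scoring='f1_macro')
print(f1.mean())  # mean macro-F1 over the 10 folds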
Leave-One-Out Validation
Train on N-1 samples and validate on the single remaining sample; repeat N times (equivalent to cross-validation with the number of folds equal to the number of samples).
from sklearn.model_selection import LeaveOneOut
res = []
loo = LeaveOneOut()
for train, test in loo.split(X):
    train_X, test_X, train_y, test_y = X[train], X[test], y[train], y[test]
    clf = DecisionTreeClassifier()
    clf.fit(train_X, train_y)
    predicted = clf.predict(test_X)
    res.extend((predicted == test_y).tolist())
sum(res)  # number of correctly classified samples; sum(res)/len(res) gives the accuracy
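The same leave-one-out loop can be written in one line, since cross_val_score accepts a splitter object as cv; a minimal sketch:
from sklearn.model_selection import cross_val_score, LeaveOneOut
acc = cross_val_score(DecisionTreeClassifier(), X, y, cv=LeaveOneOut())  # one 0/1 score per left-out sample
print(acc.mean())  # leave-one-out accuracy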
Evaluating Classification Models with ROC Curves
Plotting an ROC curve
from sklearn.datasets import load_iris
from sklearn.tree import DecisionTreeClassifier
from sklearn import preprocessing
iris = load_iris()
X = iris.data[50:150]  # an ROC curve is for binary classification (a 2x2 confusion matrix), so keep only two of the three classes
le = preprocessing.LabelEncoder()
y = le.fit_transform(iris.target[50:150])  # the selected targets are 1 and 2; LabelEncoder re-encodes them as 0 and 1
from sklearn.model_selection import train_test_split
train_X, test_X, train_y, test_y = train_test_split(X, y, test_size=0.33, random_state=123)
clf = DecisionTreeClassifier()
clf.fit(train_X, train_y)
probas_ = clf.predict_proba(test_X)  # predicted probability of each class for the validation samples
from sklearn.metrics import roc_curve,auc
fpr, tpr, thresholds = roc_curve(test_y, probas_[:, 1])  # false positive rate and true positive rate at each threshold
import matplotlib.pyplot as plt
plt.plot(fpr,tpr,label='ROC curve')
plt.plot([0,1],[0,1],'k--')
plt.xlim([0.0,1.0])
plt.ylim([0.0,1.0])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve')
plt.legend(loc='lower right')
plt.show()
Computing the AUC (area under the curve)
The closer the AUC is to 1, the better the model distinguishes the two classes.
from sklearn.metrics import auc
roc_auc = auc(fpr,tpr)
print('Area under the curve:{}'.format(roc_auc))
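When only the number is needed, sklearn.metrics.roc_auc_score computes the AUC directly from labels and scores without building the curve first:
from sklearn.metrics import roc_auc_score
print(roc_auc_score(test_y, probas_[:, 1]))  # same value as auc(fpr, tpr)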
Comparing ROC curves of different models
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
clf1 = DecisionTreeClassifier()
clf2 = SVC(probability=True)  # probability=True is required so SVC exposes predict_proba
clf3 = LogisticRegression()
clf4 = RandomForestClassifier()
# each model is fitted inside the plotting loop below
from sklearn.metrics import roc_curve,auc
plt.figure(figsize=[20,10])
for clf, title in zip([clf1, clf2, clf3, clf4], ['Decision Tree', 'SVM', 'LogisticRegression', 'RandomForest']):
    probas_ = clf.fit(train_X, train_y).predict_proba(test_X)  # fit each model and get validation-set probabilities
    fpr, tpr, thresholds = roc_curve(test_y, probas_[:, 1])
    plt.plot(fpr, tpr, label='%s-AUC:%.2f' % (title, auc(fpr, tpr)))
plt.plot([0,1],[0,1],'k--')
plt.xlim([0.0,1.0])
plt.ylim([0.0,1.0])
plt.xlabel('False Positive Rate',fontsize=20)
plt.ylabel('True Positive Rate',fontsize=20)
plt.title('ROC Curve',fontsize=20)
plt.legend(loc='lower right',fontsize=20)
plt.show()
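A single train/validation split can favor one model by chance; a steadier comparison averages AUC over cross-validation folds. A minimal sketch (cv=5 is an arbitrary choice):
from sklearn.model_selection import cross_val_score
for clf, title in zip([clf1, clf2, clf3, clf4], ['Decision Tree', 'SVM', 'LogisticRegression', 'RandomForest']):
    scores = cross_val_score(clf, X, y, cv=5, scoring='roc_auc')  # AUC on each fold
    print('%s: %.2f +/- %.2f' % (title, scores.mean(), scores.std()))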
Ranking features by importance
import numpy as np
columns = np.array(iris.feature_names)  # convert feature_names from a list to an array so it can be fancy-indexed
importance = columns[clf1.feature_importances_.argsort()[::-1]]
# clf1.feature_importances_ gives the importance of each feature
# argsort() returns the indices that would sort the array in ascending order
# [::-1] reverses that order, giving descending importance
# columns[clf1.feature_importances_.argsort()[::-1]] therefore lists the feature names from most to least important
print(importance)
# visualize the feature importance ranking
import matplotlib.pyplot as plt
feature_importance = clf1.feature_importances_
plt.title('Feature Importance')
plt.bar(range(0,len(importance)),feature_importance[feature_importance.argsort()[::-1]])
plt.xticks(range(0,len(importance)),importance,rotation=90)
plt.show()
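feature_importances_ exists only for tree-based models; permutation importance (sklearn.inspection, available since scikit-learn 0.22) works for any fitted estimator by measuring how much shuffling each feature degrades the score. A minimal sketch on the held-out data:
from sklearn.inspection import permutation_importance
result = permutation_importance(clf1, test_X, test_y, n_repeats=10, random_state=123)
for idx in result.importances_mean.argsort()[::-1]:
    print('%s: %.3f' % (columns[idx], result.importances_mean[idx]))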