Load the data
import pandas as pd
df_wine = pd.read_csv('http://archive.ics.uci.edu/ml/machine-learning-databases/wine/wine.data', header=None)
df_wine.columns = ['Class label', 'Alcohol',
                   'Malic acid', 'Ash',
                   'Alcalinity of ash', 'Magnesium',
                   'Total phenols', 'Flavanoids',
                   'Nonflavanoid phenols', 'Proanthocyanins',
                   'Color intensity', 'Hue',
                   'OD280/OD315 of diluted wines', 'Proline']
y = df_wine['Class label'].values
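The file has no header row, so the columns are named by hand. A quick sanity check (not in the original) confirms the expected shape, 178 samples with 13 features plus the label, and the class balance:

# quick look at the loaded data
print(df_wine.shape)                          # (178, 14)
print(df_wine['Class label'].value_counts())  # three classes with 71/59/48 samples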
Feature selection
To keep the later visualization manageable we select just 2 features, choosing them by the Pearson correlation coefficient between each feature and the target y.
# pearsonr returns the correlation coefficient and a p-value;
# a p-value < 0.01 means the correlation is statistically significant
from scipy.stats import pearsonr
label = df_wine.values[:, 0]
lr = []
for i, col in enumerate(df_wine.values.T):
    lr.append([pearsonr(label, col), i])
lr.sort()  # ascending by correlation coefficient
# lr[-1] is the label column itself (r = 1), so take the most negative
# correlation (lr[0]) and the strongest positive non-label one (lr[-2])
X = df_wine[[df_wine.columns[lr[0][1]], df_wine.columns[lr[-2][1]]]].values
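To see which two columns this picks, printing the sorted list (a quick inspection, not in the original) shows each column's correlation with the label; the last entry is the label itself:

# list columns by their correlation with the class label
for (r, p), i in lr:
    print('%-28s r=% .3f  p=%.3g' % (df_wine.columns[i], r, p))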
PCA is another way to reduce to 2 dimensions, though in this example the classification results after PCA are not as good.
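A minimal sketch of that PCA variant, assuming scikit-learn's PCA on standardized features (the scaling step is an assumption, not something the original states):

from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
X_all = df_wine.iloc[:, 1:].values
X_std = StandardScaler().fit_transform(X_all)  # assumption: standardize before PCA
pca = PCA(n_components=2)
X = pca.fit_transform(X_std)
print(pca.explained_variance_ratio_)  # variance captured by the two components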
Since class labels are available here, LDA offers a supervised way to reduce to 2 dimensions. It works well on this data: classification reaches 100% accuracy.
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
X = df_wine.iloc[:, 1:].values  # all features, dropping the class label column
lda = LinearDiscriminantAnalysis(n_components=2)
X = lda.fit(X, y).transform(X)
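Since the whole point of keeping 2 dimensions is visualization, a scatter of the two discriminant axes (matplotlib assumed; not part of the original) makes the separation easy to check by eye:

import matplotlib.pyplot as plt
for cls, marker in zip((1, 2, 3), ('o', 's', '^')):
    plt.scatter(X[y == cls, 0], X[y == cls, 1], marker=marker, label='class %d' % cls)
plt.xlabel('LD 1')
plt.ylabel('LD 2')
plt.legend(loc='best')
plt.show()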
Parameter tuning: here we tune only one parameter, the decision tree depth.
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import BaggingClassifier
# hold out 30% of the data as a test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1)
param_test1 = {'max_depth': range(1, 20)}
gsearch1 = GridSearchCV(estimator=DecisionTreeClassifier(criterion="entropy",
                                                         random_state=10),
                        param_grid=param_test1, cv=10)
gsearch1.fit(X_train, y_train)
# print(gsearch1.cv_results_)
print(gsearch1.best_params_)
print(gsearch1.best_score_)
Output
{'max_depth': 8}
0.822580645161
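Plotting the mean cross-validation accuracy across depths (matplotlib assumed; it reuses the cv_results_ collected above) shows how flat or peaked the curve is around the chosen depth:

import matplotlib.pyplot as plt
depths = list(param_test1['max_depth'])
plt.plot(depths, gsearch1.cv_results_['mean_test_score'], marker='o')
plt.xlabel('max_depth')
plt.ylabel('mean CV accuracy')
plt.show()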
Measure the accuracy of a single decision tree
# retrain one tree at the tuned depth and score it on the train and test sets
from sklearn.metrics import accuracy_score
tree = DecisionTreeClassifier(criterion="entropy", max_depth=gsearch1.best_params_['max_depth'])
tree = tree.fit(X_train, y_train)
y_train_pred = tree.predict(X_train)
y_test_pred = tree.predict(X_test)
tree_train = accuracy_score(y_train, y_train_pred)
tree_test = accuracy_score(y_test, y_test_pred)
print('Decision tree train/test accuracies %.3f/%.3f' % (tree_train, tree_test))
Decision tree train/test accuracies 0.984/0.815
# build a bagging ensemble of 50 trees; see the official docs for the full parameter list
bag = BaggingClassifier(base_estimator=tree, n_estimators=50,
                        max_samples=1.0, max_features=1.0,
                        bootstrap=True, bootstrap_features=False,
                        n_jobs=1, random_state=1)
# measure the accuracy of the bagging classifier
bag = bag.fit(X_train, y_train)
y_train_pred = bag.predict(X_train)
y_test_pred = bag.predict(X_test)
bag_train = accuracy_score(y_train, y_train_pred)
bag_test = accuracy_score(y_test, y_test_pred)
print('Bagging train/test accuracies %.3f/%.3f' % (bag_train, bag_test))
The Bagging classifier does beat the single decision tree, though only by a small margin.
Bagging train/test accuracies 1.000/0.852
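Because each bootstrap sample leaves out roughly a third of the training rows, bagging can also estimate generalization accuracy without touching the test set. A sketch using the oob_score option (an assumption; the original run does not enable it):

bag_oob = BaggingClassifier(base_estimator=tree, n_estimators=50,
                            bootstrap=True, oob_score=True, random_state=1)
bag_oob.fit(X_train, y_train)
print('OOB accuracy %.3f' % bag_oob.oob_score_)  # out-of-bag estimate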
Boosting classifier. Bagging combines trees trained independently on bootstrap samples by voting/averaging, whereas Boosting trains them sequentially, reweighting the training samples so each new learner concentrates on the examples its predecessors misclassified; AdaBoost below is the classic instance.
from sklearn.ensemble import AdaBoostClassifier
ada = AdaBoostClassifier(base_estimator=tree, n_estimators=1000, learning_rate=0.1, random_state=0)
ada = ada.fit(X_train, y_train)
y_train_pred = ada.predict(X_train)
y_test_pred = ada.predict(X_test)
ada_train = accuracy_score(y_train, y_train_pred)
ada_test = accuracy_score(y_test, y_test_pred)
print('AdaBoost train/test accuracies %.3f/%.3f' % (ada_train, ada_test))
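The 2-feature setup was chosen precisely so the results could be visualized. A closing sketch (matplotlib and numpy assumed; the 0.1 grid step is an arbitrary choice) draws the decision regions of the three fitted models side by side:

import numpy as np
import matplotlib.pyplot as plt
x_min, x_max = X_train[:, 0].min() - 1, X_train[:, 0].max() + 1
y_min, y_max = X_train[:, 1].min() - 1, X_train[:, 1].max() + 1
xx, yy = np.meshgrid(np.arange(x_min, x_max, 0.1),
                     np.arange(y_min, y_max, 0.1))
fig, axes = plt.subplots(1, 3, sharex='col', sharey='row', figsize=(12, 4))
for ax, clf, title in zip(axes, (tree, bag, ada),
                          ('Decision tree', 'Bagging', 'AdaBoost')):
    Z = clf.predict(np.c_[xx.ravel(), yy.ravel()]).reshape(xx.shape)
    ax.contourf(xx, yy, Z, alpha=0.3)                # shaded decision regions
    ax.scatter(X_train[:, 0], X_train[:, 1], c=y_train, edgecolor='k', s=20)
    ax.set_title(title)
plt.show()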