# -*- coding: utf-8 -*-
import numpy as np
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import classification_report
from sklearn.naive_bayes import BernoulliNB
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cross_validation import train_test_split
import matplotlib.pyplot as plt
import pandas as pd
#### KNN (k-nearest-neighbours) ####
# Load the sales data set; the spreadsheet's serial-number column becomes the index.
inputfile = 'd:/data/sales_data.xls'
# The column name previously read u'序號(hào)': a pinyin gloss "(hào)" had been
# fused into the header text, so it could not match a real column header.
# NOTE(review): restored to u'序号' -- confirm against the header in the .xls file.
data = pd.read_excel(inputfile, index_col=u'序号')
# The cells hold categorical labels, so encode them as numbers:
# "好" / "是" / "高" (good / yes / high) become 1, and every remaining value
# ("壞" / "否" / "低": bad / no / low) becomes -1.
data[data == u'好'] = 1
data[data == u'是'] = 1
data[data == u'高'] = 1
data[data != 1] = -1
# The first three columns are the features, the fourth column is the target.
# `.values` is used instead of DataFrame.as_matrix(), which was deprecated in
# pandas 0.23 and removed in pandas 1.0; `.values` works on old and new versions.
x = data.iloc[:, :3].values.astype(int)
y = data.iloc[:, 3].values.astype(int)
# Split into training and test data, holding out 20% of the rows for testing.
# NOTE(review): train_test_split is imported from sklearn.cross_validation at the
# top of this file; that module was removed in scikit-learn 0.20 in favour of
# sklearn.model_selection.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2)
# Train the KNN classifier, using a k-d tree for the neighbour search.
clf = KNeighborsClassifier(algorithm='kd_tree')
clf.fit(x_train, y_train)
# Evaluate on the held-out test data: print the test features, the predicted
# labels, the true labels, and the fraction of predictions that match.
answer = clf.predict(x_test)
print(x_test)
print(answer)
print(y_test)
print(np.mean(answer == y_test))
# Precision / recall computed on the training data.
# NOTE(review): the curve is fed hard class predictions rather than scores, and
# precision / recall / thresholds are not used anywhere in the visible script.
precision, recall, thresholds = precision_recall_curve(y_train, clf.predict(x_train))
# classification_report matches target_names to the labels in sorted order
# ([-1, 1]), so "低" (low, -1) has to come before "高" (high, 1). The names were
# previously reversed here, which swapped the two rows of the report; the order
# now agrees with the other reports in this script.
print(classification_report(y_test, answer, target_names=['低', '高']))
#### Naive Bayes classifier ####
# Fit a Bernoulli naive Bayes model on the training data
# (fit() returns the estimator itself, so construction and fitting are chained).
clf = BernoulliNB().fit(x_train, y_train)
# Predict the test data, then print the test features, the predictions and
# the true labels, in that order.
answer = clf.predict(x_test)
for shown in (x_test, answer, y_test):
    print(shown)
# Fraction of test rows whose prediction equals the true label.
print((answer == y_test).mean())
print(classification_report(y_test, answer, target_names=['低', '高']))
#### Decision tree ####
from sklearn.tree import DecisionTreeClassifier as DTC
dtc = DTC(criterion='entropy')  # decision tree model that splits on information entropy
dtc.fit(x_train, y_train)  # train the model
# Export the fitted tree for visualisation. The output is a .dot file;
# Graphviz must be installed to convert it to pdf, png or a similar format.
from sklearn.tree import export_graphviz
# NOTE(review): StringIO is not used in this section, and sklearn.externals.six
# was removed in scikit-learn 0.23 -- confirm whether this import is still needed.
from sklearn.externals.six import StringIO
with open("tree.dot", 'w') as f:
    # The body must be indented under `with` (it was not, which is an
    # IndentationError). The file handle is also no longer rebound to the
    # return value of export_graphviz.
    export_graphviz(dtc, out_file=f)
# Evaluate on the held-out test data: print the test features, the predicted
# labels, the true labels, and the fraction of predictions that match.
answer = dtc.predict(x_test)
print(x_test)
print(answer)
print(y_test)
print(np.mean(answer == y_test))
print(classification_report(y_test, answer, target_names=['低', '高']))
#### SVM ####
from sklearn.svm import SVC
# Build a support vector classifier with default settings and train it.
clf = SVC()
clf.fit(x_train, y_train)
# Evaluate on the held-out test data.
answer = clf.predict(x_test)
print(x_test)
print(answer)
print(y_test)
# Element-wise comparison of predictions with the truth; its mean is the accuracy.
hits = answer == y_test
print(np.mean(hits))
# Per-class precision / recall / F1 summary.
report = classification_report(y_test, answer, target_names=['低', '高'])
print(report)