KNN: the k-Nearest Neighbors Algorithm
I. Algorithm Principle
The core idea of kNN: if the majority of the k samples most similar to a query point in feature space (its k nearest neighbors) belong to one class, then the query point is assigned to that class and is assumed to share the characteristics of samples in that class.
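Here "nearest" is usually measured with Euclidean distance: for a query point $x$ and a training sample $x^{(i)}$, each with $m$ features,

$$d(x, x^{(i)}) = \sqrt{\sum_{j=1}^{m} \bigl(x_j - x^{(i)}_j\bigr)^2},$$

and the predicted class is the majority vote among the $k$ training samples with the smallest $d$. The implementation below follows exactly this recipe.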
II. Algorithm Implementation
import numpy as np
from math import sqrt
from collections import Counter

def kNN_classify(k, X_train, y_train, x):
    assert 1 <= k <= X_train.shape[0], "ERROR: k must be valid!"
    assert X_train.shape[0] == y_train.shape[0], \
        "ERROR: the size of X_train must equal the size of y_train!"
    assert X_train.shape[1] == x.shape[0], \
        "ERROR: the feature number of x must be equal to X_train!"
    # Euclidean distance from the query point to every training sample
    distances = [sqrt(np.sum((x_train - x)**2)) for x_train in X_train]
    # Indices of the training samples, sorted by ascending distance
    nearest = np.argsort(distances)
    # Labels of the k nearest samples
    topK_y = [y_train[i] for i in nearest[:k]]
    # Counter tallies how often each label occurs: {label: count}
    votes = Counter(topK_y)
    # most_common(1) returns the (label, count) pair with the highest count
    return votes.most_common(1)[0][0]
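As a quick standalone illustration of the voting step (a minimal sketch, not tied to any particular dataset):

from collections import Counter
votes = Counter([1, 1, 0, 1, 1, 0])
votes                       # Counter({1: 4, 0: 2})
votes.most_common(1)        # [(1, 4)]
votes.most_common(1)[0][0]  # 1 -- the winning label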
Example:
# Dataset
# Features
raw_data_x = [[3.393533211, 2.331273381],
              [2.110073483, 1.781539638],
              [1.343808831, 3.368360954],
              [3.582294042, 4.679179110],
              [2.280362439, 2.866990263],
              [7.423436942, 4.696522875],
              [5.745051997, 3.533989803],
              [9.172168622, 2.511101045],
              [7.792783481, 3.424088941],
              [7.939820817, 0.791637231]]
# Class labels
raw_data_y = [0, 0, 0, 0, 0, 1, 1, 1, 1, 1]
X_train = np.array(raw_data_x)
y_train = np.array(raw_data_y)
# Predict the class of a new point
x = np.array([8.093607318, 3.365731514])
predict = kNN_classify(6, X_train, y_train, x)
predict
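With this training set, five of the six nearest neighbors of x belong to class 1, so the call above should return 1.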
III. scikit-learn KNN
from sklearn.neighbors import KNeighborsClassifier
kNN_classifier = KNeighborsClassifier(n_neighbors=6)
kNN_classifier.fit(X_train, y_train)
Example:
The training data set is the same as above.
# Prediction (scikit-learn expects a 2-D array of samples)
x = np.array([[8.093607318,3.365731514]])
kNN_classifier.predict(x)
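To see the neighbors' vote fractions rather than only the winning label, KNeighborsClassifier also provides predict_proba (standard scikit-learn API; a quick sketch):

kNN_classifier.predict_proba(x)  # fraction of the 6 neighbors that voted for each class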
IV. Iris Example
1. Data Preparation
(1) Load the data
from sklearn.datasets import fetch_openml
# as_frame=False returns NumPy arrays, which the slicing below relies on
iris = fetch_openml(name='iris', version=1, as_frame=False)
(2) Get a first look at the data
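A few quick checks give a feel for the dataset (a minimal sketch; the exact feature names come from OpenML and may vary slightly by version):

print(iris.data.shape)         # (150, 4): 150 samples, 4 features
print(iris.feature_names)      # sepal/petal length and width
print(np.unique(iris.target))  # the three iris species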
(3) Split the dataset
① Using sepal length and width as features
X = iris.data[:, :2]
from sklearn.preprocessing import LabelEncoder
encoder = LabelEncoder()
y = iris.target
# Encode the string labels ('Iris-setosa', ...) as integers 0/1/2
y = encoder.fit_transform(y)
# Visualize the data
import matplotlib.pyplot as plt
plt.scatter(X[y==0,0], X[y==0,1], color="red", marker="o")
plt.scatter(X[y==1,0], X[y==1,1], color="blue", marker="+")
plt.scatter(X[y==2,0], X[y==2,1], color="green", marker="x")
plt.show()
② Using petal length and width as features
X = iris.data[:, 2:]
from sklearn.preprocessing import LabelEncoder
encoder = LabelEncoder()
y = iris.target
y = encoder.fit_transform(y)
# Visualize the data
plt.scatter(X[y==0,0], X[y==0,1], color="red", marker="o")
plt.scatter(X[y==1,0], X[y==1,1], color="blue", marker="+")
plt.scatter(X[y==2,0], X[y==2,1], color="green", marker="x")
plt.show()
- Train/test split
def train_test_set_split(X, y, test_ratio=0.2, seed=None):
    assert X.shape[0] == y.shape[0], \
        "ERROR: the size of X must be equal to the size of y!"
    assert 0.0 <= test_ratio <= 1.0, "ERROR: test_ratio must be valid!"
    if seed is not None:
        np.random.seed(seed)
    # Indices 0..len(X)-1 in random order
    shuffle_indexes = np.random.permutation(len(X))
    test_size = int(len(X) * test_ratio)
    test_indexes = shuffle_indexes[:test_size]
    train_indexes = shuffle_indexes[test_size:]
    X_train = X[train_indexes]
    y_train = y[train_indexes]
    X_test = X[test_indexes]
    y_test = y[test_indexes]
    return X_train, y_train, X_test, y_test
X_train, y_train, X_test, y_test = train_test_set_split(X, y, 0.2)
Or use the helper that scikit-learn provides:
from sklearn.model_selection import train_test_split
# note: train_test_split returns X_train, X_test, y_train, y_test,
# a different order from the custom function above
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
(4) Train the model
from sklearn.neighbors import KNeighborsClassifier
knn_clf = KNeighborsClassifier(n_neighbors=6)
knn_clf.fit(X_train, y_train)
(5) Test the model
y_predict = knn_clf.predict(X_test)
print("模型分類準(zhǔn)確率為{}".format(sum(y_predict==y_test)/len(y_test)))