1. A worked example using a dataset from sklearn
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
from scipy import stats
# use seaborn's default plotting style
import seaborn as sns; sns.set()
# generate some random data
from sklearn.datasets import make_blobs
X, y = make_blobs(n_samples=50, centers=2,
                  random_state=0, cluster_std=0.60)
plt.scatter(X[:, 0], X[:, 1], c=y, s=50, cmap='autumn')
Let's draw a few separating lines at random and see which one looks best.
xfit = np.linspace(-1, 3.5)
plt.scatter(X[:, 0], X[:, 1], c=y, s=50, cmap='autumn')
plt.plot([0.6], [2.1], 'x', color='red', markeredgewidth=2, markersize=10)
for m, b in [(1, 0.65), (0.5, 1.6), (-0.2, 2.9)]:
    plt.plot(xfit, m * xfit + b, '-k')
plt.xlim(-1, 3.5);
Going one step further, draw a shaded band around each line to show the width of its margin:
xfit = np.linspace(-1, 3.5)
plt.scatter(X[:, 0], X[:, 1], c=y, s=50, cmap='autumn')
for m, b, d in [(1, 0.65, 0.33), (0.5, 1.6, 0.55), (-0.2, 2.9, 0.2)]:
    yfit = m * xfit + b
    plt.plot(xfit, yfit, '-k')
    plt.fill_between(xfit, yfit - d, yfit + d, edgecolor='none',
                     color='#AAAAAA', alpha=0.4)
plt.xlim(-1, 3.5);
2. Training an SVM
from sklearn.svm import SVC # "Support vector classifier"
model = SVC(kernel='linear')
model.fit(X, y)
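Since the kernel is linear, the fitted boundary is just a line w[0]*x + w[1]*y + b = 0. As a quick check (a minimal sketch; the names w, b, slope, intercept are our own), we can recover the line's slope and intercept from the model's coef_ and intercept_ attributes:

# read the weight vector and bias of the fitted linear SVC
w = model.coef_[0]          # shape (2,)
b = model.intercept_[0]
# rewrite w[0]*x + w[1]*y + b = 0 as y = slope*x + intercept
slope = -w[0] / w[1]
intercept = -b / w[1]
print('slope = {0:.2f}, intercept = {1:.2f}'.format(slope, intercept))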
# plotting helper function
def plot_svc_decision_function(model, ax=None, plot_support=True):
    """Plot the decision function of a 2D SVC"""
    if ax is None:
        ax = plt.gca()
    xlim = ax.get_xlim()
    ylim = ax.get_ylim()
    # create a grid on which to evaluate the model
    x = np.linspace(xlim[0], xlim[1], 30)
    y = np.linspace(ylim[0], ylim[1], 30)
    Y, X = np.meshgrid(y, x)
    xy = np.vstack([X.ravel(), Y.ravel()]).T
    P = model.decision_function(xy).reshape(X.shape)
    # plot the decision boundary and the margins
    ax.contour(X, Y, P, colors='k',
               levels=[-1, 0, 1], alpha=0.5,
               linestyles=['--', '-', '--'])
    # mark the support vectors
    if plot_support:
        ax.scatter(model.support_vectors_[:, 0],
                   model.support_vectors_[:, 1],
                   s=300, linewidth=1, facecolors='none')
    ax.set_xlim(xlim)
    ax.set_ylim(ylim)
plt.scatter(X[:, 0], X[:, 1], c=y, s=50, cmap='autumn')
plot_svc_decision_function(model)
This line is exactly the decision boundary we were hoping for.
Notice that three points carry a special marker: they lie right on the edge of the margin.
They are our support vectors.
In Scikit-Learn they are stored in the support_vectors_ attribute of the fitted model:
model.support_vectors_
- Notice that the support vectors alone are enough to rebuild the model, as the sketch below confirms.
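A minimal sketch to verify this claim (sv_model and grid are our own names, not from the original): refit an SVC on just the support vectors and check that the two models predict identically.

# refit using only the support vectors (their training indices are in model.support_)
sv_model = SVC(kernel='linear')
sv_model.fit(model.support_vectors_, y[model.support_])
# the two models should agree everywhere; test on a random grid of points
grid = np.random.RandomState(1).uniform(-1, 4, size=(1000, 2))
print(np.all(model.predict(grid) == sv_model.predict(grid)))  # should print True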
3. Does adding more data change the result? Comparing 60 and 120 data points
def plot_svm(N=10, ax=None):
    X, y = make_blobs(n_samples=200, centers=2,
                      random_state=0, cluster_std=0.60)
    X = X[:N]
    y = y[:N]
    model = SVC(kernel='linear', C=1E10)
    model.fit(X, y)
    ax = ax or plt.gca()
    ax.scatter(X[:, 0], X[:, 1], c=y, s=50, cmap='autumn')
    ax.set_xlim(-1, 4)
    ax.set_ylim(-1, 6)
    plot_svc_decision_function(model, ax)
fig, ax = plt.subplots(1, 2, figsize=(16, 6))
fig.subplots_adjust(left=0.0625, right=0.95, wspace=0.1)
for axi, N in zip(ax, [60, 120]):
    plot_svm(N, axi)
    axi.set_title('N = {0}'.format(N))
Observe that as long as the support vectors stay the same, it does not matter how much extra data we add! We can verify this directly:
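A minimal sketch (X2, y2, and the fit_n helper are our own names): fit models on the first 60 and first 120 points, generated exactly as in plot_svm above, and compare their support vectors. If the support set is unchanged, this prints True.

# same data generation as inside plot_svm
X2, y2 = make_blobs(n_samples=200, centers=2,
                    random_state=0, cluster_std=0.60)
def fit_n(N):
    return SVC(kernel='linear', C=1E10).fit(X2[:N], y2[:N])
m60, m120 = fit_n(60), fit_n(120)
print(np.array_equal(m60.support_vectors_, m120.support_vectors_))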
4. SVM with kernel functions
First, let's see whether a linear kernel can still separate a harder dataset like the one below:
from sklearn.datasets import make_circles
X, y = make_circles(100, factor=.1, noise=.1)
clf = SVC(kernel='linear').fit(X, y)
plt.scatter(X[:, 0], X[:, 1], c=y, s=50, cmap='autumn')
plot_svc_decision_function(clf, plot_support=False)
Clearly a linear boundary cannot separate this data.
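A quick numeric confirmation (a minimal sketch; score just reports training accuracy here):

# training accuracy of the linear model on the circles data
print(clf.score(X, y))  # well below a perfect 1.0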
So next, let's try a kernel transformation into a higher dimension. We can first use a 3D plot to visualize this extra data dimension:
# add a new dimension r
from mpl_toolkits import mplot3d
r = np.exp(-(X ** 2).sum(1))
def plot_3D(elev=30, azim=30, X=X, y=y):
    ax = plt.subplot(projection='3d')
    ax.scatter3D(X[:, 0], X[:, 1], r, c=y, s=50, cmap='autumn')
    ax.view_init(elev=elev, azim=azim)
    ax.set_xlabel('x')
    ax.set_ylabel('y')
    ax.set_zlabel('r')
plot_3D(elev=45, azim=45, X=X, y=y)
# switch to a radial basis function (RBF) kernel
clf = SVC(kernel='rbf', C=1E6)
clf.fit(X, y)
# look at the result again
plt.scatter(X[:, 0], X[:, 1], c=y, s=50, cmap='autumn')
plot_svc_decision_function(clf)
plt.scatter(clf.support_vectors_[:, 0], clf.support_vectors_[:, 1],
            s=300, lw=1, facecolors='none');
With this kernelized support vector machine we learn a suitable nonlinear decision boundary. This kernel-trick strategy is used all the time in machine learning!
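To see what the kernel trick is actually computing, here is a minimal sketch (clf2, the fixed gamma=1.0, and the name decision are our own choices, not from the original) that reproduces SVC's decision_function by hand from the dual coefficients, the support vectors, and the RBF kernel k(u, v) = exp(-gamma * ||u - v||^2):

# fit with an explicit gamma so we can reuse it in the manual formula
gamma = 1.0
clf2 = SVC(kernel='rbf', C=1E6, gamma=gamma).fit(X, y)
# decision(x) = sum_j dual_coef_[0, j] * k(sv_j, x) + intercept_
diff = X[:, None, :] - clf2.support_vectors_[None, :, :]
K = np.exp(-gamma * (diff ** 2).sum(-1))       # kernel matrix, shape (n, n_SV)
decision = K @ clf2.dual_coef_[0] + clf2.intercept_[0]
print(np.allclose(decision, clf2.decision_function(X)))  # should print True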
5. Tuning SVM parameters: the soft margin problem
Tuning the C parameter:
- As C approaches infinity: classification is strict and no misclassified points are tolerated
- As C becomes very small: a larger amount of misclassification is tolerated
X, y = make_blobs(n_samples=100, centers=2,
                  random_state=0, cluster_std=0.8)
plt.scatter(X[:, 0], X[:, 1], c=y, s=50, cmap='autumn');
X, y = make_blobs(n_samples=100, centers=2,
                  random_state=0, cluster_std=0.8)
fig, ax = plt.subplots(1, 2, figsize=(16, 6))
fig.subplots_adjust(left=0.0625, right=0.95, wspace=0.1)
for axi, C in zip(ax, [10.0, 0.1]):
    model = SVC(kernel='linear', C=C).fit(X, y)
    axi.scatter(X[:, 0], X[:, 1], c=y, s=50, cmap='autumn')
    plot_svc_decision_function(model, axi)
    axi.scatter(model.support_vectors_[:, 0],
                model.support_vectors_[:, 1],
                s=300, lw=1, facecolors='none')
    axi.set_title('C = {0:.1f}'.format(C), size=14)
Comparison of different C values
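Beyond the pictures, we can quantify the effect of C by counting support vectors (a minimal sketch; the C values chosen here are arbitrary): a smaller C gives a softer margin and therefore typically more support vectors.

for C in [100.0, 10.0, 1.0, 0.1]:
    m = SVC(kernel='linear', C=C).fit(X, y)
    print('C = {0:>5}: {1} support vectors'.format(C, len(m.support_)))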
X, y = make_blobs(n_samples=100, centers=2,
                  random_state=0, cluster_std=1.1)
fig, ax = plt.subplots(1, 2, figsize=(16, 6))
fig.subplots_adjust(left=0.0625, right=0.95, wspace=0.1)
for axi, gamma in zip(ax, [10.0, 0.1]):
    model = SVC(kernel='rbf', gamma=gamma).fit(X, y)
    axi.scatter(X[:, 0], X[:, 1], c=y, s=50, cmap='autumn')
    plot_svc_decision_function(model, axi)
    axi.scatter(model.support_vectors_[:, 0],
                model.support_vectors_[:, 1],
                s=300, lw=1, facecolors='none')
    axi.set_title('gamma = {0:.1f}'.format(gamma), size=14)
Comparison of different gamma values
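In practice, C and gamma are usually tuned together with cross-validation rather than by eye. A minimal sketch using GridSearchCV (the parameter grid below is an arbitrary example, not from the original):

from sklearn.model_selection import GridSearchCV
param_grid = {'C': [0.1, 1, 10, 100],
              'gamma': [0.01, 0.1, 1, 10]}
grid = GridSearchCV(SVC(kernel='rbf'), param_grid, cv=5)
grid.fit(X, y)
print(grid.best_params_, grid.best_score_)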