Introduction to Gradient Descent
Below is an example plot: once a loss function is defined, it shows the value of the loss J corresponding to each value of the parameter theta. We want to find the theta at which J attains its minimum (the plot is a 2D plane, i.e., there is only one parameter).
In the equation of a straight line, the derivative is the slope; for a curve, the derivative is the slope of the tangent line. In other words, the derivative tells us how much J changes when theta changes by one unit.
If η is too small, convergence is slow.
If η is too large, gradient descent may fail to converge at all.
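Putting this together, each step of gradient descent moves theta a small amount against the derivative. Written out, the update rule implemented in the simulation below is

\[
\theta \leftarrow \theta - \eta \,\frac{dJ}{d\theta}
\]

where η (eta) is the learning rate.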
Other notes
- Not every function has a unique extremum.
Solutions:
- Run the algorithm several times with randomized initial points.
- The initial point of gradient descent is itself a hyperparameter.
Simulating Gradient Descent
import numpy as np
import matplotlib.pyplot as plt
plot_x = np.linspace(-1., 6., 141)
plot_x
array([-1. , -0.95, -0.9 , -0.85, -0.8 , -0.75, -0.7 , -0.65, -0.6 ,
-0.55, -0.5 , -0.45, -0.4 , -0.35, -0.3 , -0.25, -0.2 , -0.15,
-0.1 , -0.05, 0. , 0.05, 0.1 , 0.15, 0.2 , 0.25, 0.3 ,
0.35, 0.4 , 0.45, 0.5 , 0.55, 0.6 , 0.65, 0.7 , 0.75,
0.8 , 0.85, 0.9 , 0.95, 1. , 1.05, 1.1 , 1.15, 1.2 ,
1.25, 1.3 , 1.35, 1.4 , 1.45, 1.5 , 1.55, 1.6 , 1.65,
1.7 , 1.75, 1.8 , 1.85, 1.9 , 1.95, 2. , 2.05, 2.1 ,
2.15, 2.2 , 2.25, 2.3 , 2.35, 2.4 , 2.45, 2.5 , 2.55,
2.6 , 2.65, 2.7 , 2.75, 2.8 , 2.85, 2.9 , 2.95, 3. ,
3.05, 3.1 , 3.15, 3.2 , 3.25, 3.3 , 3.35, 3.4 , 3.45,
3.5 , 3.55, 3.6 , 3.65, 3.7 , 3.75, 3.8 , 3.85, 3.9 ,
3.95, 4. , 4.05, 4.1 , 4.15, 4.2 , 4.25, 4.3 , 4.35,
4.4 , 4.45, 4.5 , 4.55, 4.6 , 4.65, 4.7 , 4.75, 4.8 ,
4.85, 4.9 , 4.95, 5. , 5.05, 5.1 , 5.15, 5.2 , 5.25,
5.3 , 5.35, 5.4 , 5.45, 5.5 , 5.55, 5.6 , 5.65, 5.7 ,
5.75, 5.8 , 5.85, 5.9 , 5.95, 6. ])
plot_y = (plot_x-2.5)**2 - 1.
plt.plot(plot_x, plot_y)
plt.show()
epsilon = 1e-8  # 10 to the power of -8
eta = 0.1
def J(theta):
    return (theta-2.5)**2 - 1.

def dJ(theta):
    return 2*(theta-2.5)

theta = 0.0
while True:
    gradient = dJ(theta)
    last_theta = theta
    theta = theta - eta * gradient

    if(abs(J(theta) - J(last_theta)) < epsilon):
        break

print(theta)
print(J(theta))
2.499891109642585
-0.99999998814289
theta = 0.0
theta_history = [theta]

while True:
    gradient = dJ(theta)
    last_theta = theta
    theta = theta - eta * gradient
    theta_history.append(theta)

    if(abs(J(theta) - J(last_theta)) < epsilon):
        break

plt.plot(plot_x, J(plot_x))
plt.plot(np.array(theta_history), J(np.array(theta_history)), color="r", marker='+')
plt.show()
len(theta_history)
46
theta_history = []

def gradient_descent(initial_theta, eta, epsilon=1e-8):
    theta = initial_theta
    theta_history.append(initial_theta)

    while True:
        gradient = dJ(theta)
        last_theta = theta
        theta = theta - eta * gradient
        theta_history.append(theta)

        if(abs(J(theta) - J(last_theta)) < epsilon):
            break

def plot_theta_history():
    plt.plot(plot_x, J(plot_x))
    plt.plot(np.array(theta_history), J(np.array(theta_history)), color="r", marker='+')
    plt.show()
eta = 0.01
theta_history = []
gradient_descent(0, eta)
plot_theta_history()
len(theta_history)
424
eta = 0.001
theta_history = []
gradient_descent(0, eta)
plot_theta_history()
len(theta_history)
3682
eta = 0.8
theta_history = []
gradient_descent(0, eta)
plot_theta_history()
eta = 1.1
theta_history = []
gradient_descent(0, eta)
---------------------------------------------------------------------------
OverflowError Traceback (most recent call last)
<ipython-input-41-81ad09e9d3e0> in <module>()
1 eta = 1.1
2 theta_history = []
----> 3 gradient_descent(0, eta)
<ipython-input-35-37822183f4df> in gradient_descent(initial_theta, eta, epsilon)
11 theta_history.append(theta)
12
---> 13 if(abs(J(theta) - J(last_theta)) < epsilon):
14 break
15
<ipython-input-32-e8b6a6f56c24> in J(theta)
1 def J(theta):
----> 2 return (theta-2.5)**2 - 1.
3
4 def dJ(theta):
5 return 2*(theta-2.5)
OverflowError: (34, 'Result too large')
def J(theta):
    try:
        return (theta-2.5)**2 - 1.
    except:
        return float('inf')
def gradient_descent(initial_theta, eta, n_iters = 1e4, epsilon=1e-8):
    theta = initial_theta
    i_iter = 0
    theta_history.append(initial_theta)

    while i_iter < n_iters:
        gradient = dJ(theta)
        last_theta = theta
        theta = theta - eta * gradient
        theta_history.append(theta)

        if(abs(J(theta) - J(last_theta)) < epsilon):
            break

        i_iter += 1

    return
eta = 1.1
theta_history = []
gradient_descent(0, eta)
len(theta_history)
10001
eta = 1.1
theta_history = []
gradient_descent(0, eta, n_iters=10)
plot_theta_history()
Gradient Descent for Multiple Linear Regression
Gradient descent in 3D space (x and y are the coefficients, z is the loss function)
Derivation
The magnitude of the expression derived above depends on the number of samples: the larger m is, the larger the result, which is unreasonable. We want the gradient to be independent of m, so we divide by m.
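Written out, the loss and its gradient (after dividing by m so the result no longer scales with the sample size) are what the code below implements:

\[
J(\theta) = \frac{1}{m}\sum_{i=1}^{m}\left(y^{(i)} - X_b^{(i)}\theta\right)^2
\]

\[
\nabla J(\theta) = \frac{2}{m}\begin{pmatrix}
\sum_{i=1}^{m}\left(X_b^{(i)}\theta - y^{(i)}\right) \\
\sum_{i=1}^{m}\left(X_b^{(i)}\theta - y^{(i)}\right)X_1^{(i)} \\
\vdots \\
\sum_{i=1}^{m}\left(X_b^{(i)}\theta - y^{(i)}\right)X_n^{(i)}
\end{pmatrix}
\]

Here X_b^{(i)} is the i-th row of X_b (with the leading 1), and X_j^{(i)} is the j-th feature of sample i.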
Using Gradient Descent in a Linear Regression Model
import numpy as np
import matplotlib.pyplot as plt
np.random.seed(666)
x = 2 * np.random.random(size=100)
y = x * 3. + 4. + np.random.normal(size=100)
X = x.reshape(-1, 1)
X[:20]
array([[ 1.40087424],
[ 1.68837329],
[ 1.35302867],
[ 1.45571611],
[ 1.90291591],
[ 0.02540639],
[ 0.8271754 ],
[ 0.09762559],
[ 0.19985712],
[ 1.01613261],
[ 0.40049508],
[ 1.48830834],
[ 0.38578401],
[ 1.4016895 ],
[ 0.58645621],
[ 1.54895891],
[ 0.01021768],
[ 0.22571531],
[ 0.22190734],
[ 0.49533646]])
y[:20]
array([ 8.91412688, 8.89446981, 8.85921604, 9.04490343, 8.75831915,
4.01914255, 6.84103696, 4.81582242, 3.68561238, 6.46344854,
4.61756153, 8.45774339, 3.21438541, 7.98486624, 4.18885101,
8.46060979, 4.29706975, 4.06803046, 3.58490782, 7.0558176 ])
plt.scatter(x, y)
plt.show()
Training with gradient descent
Since the data passed in can have feature values of very different magnitudes, the loss can still overflow, so J returns inf when the computation fails.
def J(theta, X_b, y):
    try:
        return np.sum((y - X_b.dot(theta))**2) / len(X_b)
    except:
        return float('inf')

def dJ(theta, X_b, y):
    res = np.empty(len(theta))
    res[0] = np.sum(X_b.dot(theta) - y)
    for i in range(1, len(theta)):
        res[i] = (X_b.dot(theta) - y).dot(X_b[:,i])
    return res * 2 / len(X_b)
def gradient_descent(X_b, y, initial_theta, eta, n_iters = 1e4, epsilon=1e-8):
    theta = initial_theta
    cur_iter = 0

    while cur_iter < n_iters:
        gradient = dJ(theta, X_b, y)
        last_theta = theta
        theta = theta - eta * gradient
        if(abs(J(theta, X_b, y) - J(last_theta, X_b, y)) < epsilon):
            break

        cur_iter += 1

    return theta
X_b = np.hstack([np.ones((len(x), 1)), x.reshape(-1,1)])
initial_theta = np.zeros(X_b.shape[1])
eta = 0.01
theta = gradient_descent(X_b, y, initial_theta, eta)
theta
array([ 4.02145786, 3.00706277])
Encapsulating Our Linear Regression Algorithm
import numpy as np
from .metrics import r2_score
class LinearRegression:

    def __init__(self):
        """Initialize the Linear Regression model"""
        self.coef_ = None
        self.intercept_ = None
        self._theta = None

    def fit_gd(self, X_train, y_train, eta=0.01, n_iters=1e4):
        """Train the Linear Regression model on X_train, y_train using gradient descent"""
        assert X_train.shape[0] == y_train.shape[0], \
            "the size of X_train must be equal to the size of y_train"

        def J(theta, X_b, y):
            try:
                return np.sum((y - X_b.dot(theta)) ** 2) / len(y)
            except:
                return float('inf')

        def dJ(theta, X_b, y):
            res = np.empty(len(theta))
            res[0] = np.sum(X_b.dot(theta) - y)
            for i in range(1, len(theta)):
                res[i] = (X_b.dot(theta) - y).dot(X_b[:, i])
            return res * 2 / len(X_b)

        def gradient_descent(X_b, y, initial_theta, eta, n_iters=1e4, epsilon=1e-8):
            theta = initial_theta
            cur_iter = 0

            while cur_iter < n_iters:
                gradient = dJ(theta, X_b, y)
                last_theta = theta
                theta = theta - eta * gradient
                if (abs(J(theta, X_b, y) - J(last_theta, X_b, y)) < epsilon):
                    break

                cur_iter += 1

            return theta

        X_b = np.hstack([np.ones((len(X_train), 1)), X_train])
        initial_theta = np.zeros(X_b.shape[1])
        self._theta = gradient_descent(X_b, y_train, initial_theta, eta, n_iters)

        self.intercept_ = self._theta[0]
        self.coef_ = self._theta[1:]

        return self

    def predict(self, X_predict):
        """Given a data set X_predict, return the vector of predictions for X_predict"""
        assert self.intercept_ is not None and self.coef_ is not None, \
            "must fit before predict!"
        assert X_predict.shape[1] == len(self.coef_), \
            "the feature number of X_predict must be equal to X_train"

        X_b = np.hstack([np.ones((len(X_predict), 1)), X_predict])
        return X_b.dot(self._theta)

    def score(self, X_test, y_test):
        """Evaluate the current model on the test sets X_test and y_test"""
        y_predict = self.predict(X_test)
        return r2_score(y_test, y_predict)

    def __repr__(self):
        return "LinearRegression()"
Testing Our Linear Regression Algorithm
from playML.LinearRegression import LinearRegression
lin_reg = LinearRegression()
lin_reg.fit_gd(X, y)
LinearRegression()
lin_reg.coef_
array([ 3.00706277])
lin_reg.intercept_
4.021457858204859
Vectorization
The row vector expanded above comes from expanding the m terms of the expression one by one; the block it is dotted with is X_b, and the expanded dot product equals the original expression on the left. This is a 1×m vector dotted with an m×(n+1) matrix, which yields a 1×(n+1) row vector. Although numpy does not distinguish row vectors from column vectors, the final expression transposes the whole vector before expanding, which gives the result.
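In vectorized form this is

\[
\nabla J(\theta) = \frac{2}{m}\,X_b^{T}\big(X_b\,\theta - y\big)
\]

which is exactly what the modified dJ below computes.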
Modify the previous gradient function
def dJ(theta, X_b, y):
    ## res = np.empty(len(theta))
    ## res[0] = np.sum(X_b.dot(theta) - y)
    ##
    ## for i in range(1, len(theta)):
    ##     res[i] = np.sum((X_b.dot(theta) - y).dot(X_b[:, i]))
    ##
    ## return res * 2 / len(X_b)
    return X_b.T.dot(X_b.dot(theta) - y) * 2. / len(X_b)
Vectorized Gradient Descent
import numpy as np
from sklearn import datasets
boston = datasets.load_boston()
X = boston.data
y = boston.target
X = X[y < 50.0]
y = y[y < 50.0]
from playML.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, seed=666)
from playML.LinearRegression import LinearRegression
lin_reg1 = LinearRegression()
%time lin_reg1.fit_normal(X_train, y_train)
lin_reg1.score(X_test, y_test)
CPU times: user 45.6 ms, sys: 5.74 ms, total: 51.3 ms
Wall time: 56.4 ms
0.81298026026584658
Using gradient descent
lin_reg2 = LinearRegression()
lin_reg2.fit_gd(X_train, y_train)
/Users/yuanzhang/Dropbox/code/My MOOC/Play-with-Machine-Learning-Algorithms/06-Gradient-Descent/05-Vectorize-Gradient-Descent/playML/LinearRegression.py:32: RuntimeWarning: overflow encountered in square
return np.sum((y - X_b.dot(theta)) ** 2) / len(y)
/Users/yuanzhang/Dropbox/code/My MOOC/Play-with-Machine-Learning-Algorithms/06-Gradient-Descent/05-Vectorize-Gradient-Descent/playML/LinearRegression.py:48: RuntimeWarning: invalid value encountered in double_scalars
if (abs(J(theta, X_b, y) - J(last_theta, X_b, y)) < epsilon):
LinearRegression()
lin_reg2.coef_
array([ nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
nan, nan])
lin_reg2.fit_gd(X_train, y_train, eta=0.000001)
LinearRegression()
lin_reg2.score(X_test, y_test)
0.27556634853389195
%time lin_reg2.fit_gd(X_train, y_train, eta=0.000001, n_iters=1e6)
CPU times: user 48.4 s, sys: 265 ms, total: 48.6 s
Wall time: 49.9 s
LinearRegression()
lin_reg2.score(X_test, y_test)
0.75418523539807636
With real data, tuning eta and n_iters is awkward: if eta is not tiny the loss overflows and we get no valid result, while with a tiny eta we either stop far from a good solution or need so many iterations that training becomes very slow. The root cause is that the features live on very different scales, so we need to normalize the data.
Data Normalization
Normalize the data before running gradient descent
from sklearn.preprocessing import StandardScaler
standardScaler = StandardScaler()
standardScaler.fit(X_train)
X_train_standard = standardScaler.transform(X_train)
lin_reg3 = LinearRegression()
%time lin_reg3.fit_gd(X_train_standard, y_train)
CPU times: user 237 ms, sys: 4.37 ms, total: 242 ms
Wall time: 258 ms
LinearRegression()
X_test_standard = standardScaler.transform(X_test)
lin_reg3.score(X_test_standard, y_test)
0.81298806201222351
The Advantage of Gradient Descent
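For comparison, fit_normal below fits the model in closed form; assuming it implements the standard normal equation, it computes

\[
\theta = \left(X_b^{T} X_b\right)^{-1} X_b^{T} y
\]

which requires forming and inverting an (n+1)×(n+1) matrix, whereas gradient descent only needs repeated matrix-vector products.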
m = 1000
n = 5000
big_X = np.random.normal(size=(m, n))
true_theta = np.random.uniform(0.0, 100.0, size=n+1)
big_y = big_X.dot(true_theta[1:]) + true_theta[0] + np.random.normal(0., 10., size=m)
big_reg1 = LinearRegression()
%time big_reg1.fit_normal(big_X, big_y)
CPU times: user 18.8 s, sys: 899 ms, total: 19.7 s
Wall time: 10.9 s
LinearRegression()
big_reg2 = LinearRegression()
%time big_reg2.fit_gd(big_X, big_y)
CPU times: user 9.51 s, sys: 121 ms, total: 9.63 s
Wall time: 5.76 s
LinearRegression()
If the number of samples is very large, even batch gradient descent is slow, because every sample takes part in every gradient computation. In that case we use stochastic gradient descent.
Stochastic Gradient Descent
Introduction to stochastic gradient descent
One problem with batch gradient descent is that η must be set fairly small, and with many samples it becomes quite slow. Looking at the derivative of the loss function, we sum a term over every sample X_b^(i) and then divide by m; we can drop both the summation and the division by m and instead use a single randomly chosen sample each time.
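Dropping the sum and the division by m, the search direction computed from a single randomly chosen sample i is

\[
2\,\big(X_b^{(i)}\big)^{T}\big(X_b^{(i)}\theta - y^{(i)}\big)
\]

which is what dJ_sgd implements below.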
Because we use stochastic gradient descent, the result no longer moves toward the minimum as steadily as with batch gradient descent; it descends along a wandering path. We therefore want η to shrink as the number of updates grows, so that later steps are finer and the final result is more precise.
This borrows the idea of simulated annealing.
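Concretely, the learning-rate schedule used in the code below is

\[
\eta_t = \frac{t_0}{t + t_1}
\]

with t0 = 5 and t1 = 50, so η shrinks as the number of updates t grows.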
Stochastic gradient descent (SGD): update the parameters using one sample at a time, i.e., m in the formula equals 1. Each update uses a single sample, and we perform many updates. When the sample size is huge (say hundreds of thousands), only a few thousand or tens of thousands of samples may be needed to iterate theta close to the optimum, whereas batch gradient descent needs all of the training samples for a single iteration, one iteration is rarely enough, and ten iterations mean scanning the whole training set ten times, which is computationally far too expensive.
However, SGD is noisier than BGD, so not every iteration moves toward the overall optimum.
Advantage: fast training.
Disadvantages: lower accuracy, the result is not necessarily the global optimum; not easy to parallelize.
In terms of iteration count, SGD needs many more iterations, and its search through the solution space looks rather blind.
Batch gradient descent (BGD) is the original form of gradient descent: every parameter update uses all of the samples, i.e., m in equation (1) is the total number of samples.
Advantages: converges to the global optimum (for a convex loss); easy to parallelize.
Disadvantage: training is slow when the number of samples is large.
Mini-batch gradient descent (MBGD): every parameter update uses a subset of the samples, i.e., m in equation (1) is greater than 1 and smaller than the total number of samples. It is designed to overcome the drawbacks of the two methods above while keeping their advantages (a minimal sketch follows the list below).
- As the formula shows, each iteration of batch gradient descent uses every training sample, and the final result is a global optimum.
- The update rule of stochastic gradient descent has no summation, meaning each update uses only one training sample; the final result may be the global optimum, or only a local one.
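Mini-batch gradient descent is not implemented later in this section; as a minimal sketch built on the same pieces used below (the names dJ_mbgd, mbgd and the parameter batch_size are illustrative, not part of the original code), one version could look like this:

import numpy as np

def dJ_mbgd(theta, X_b_batch, y_batch):
    ## gradient of the MSE loss estimated on a mini-batch
    return X_b_batch.T.dot(X_b_batch.dot(theta) - y_batch) * 2. / len(y_batch)

def mbgd(X_b, y, initial_theta, n_iters=5, batch_size=16, t0=5, t1=50):
    def learning_rate(t):
        return t0 / (t + t1)

    theta = initial_theta
    m = len(X_b)
    t = 0
    for cur_iter in range(n_iters):
        ## shuffle the data once per pass, then walk through it in mini-batches
        indexes = np.random.permutation(m)
        X_b_new = X_b[indexes]
        y_new = y[indexes]
        for start in range(0, m, batch_size):
            X_batch = X_b_new[start:start+batch_size]
            y_batch = y_new[start:start+batch_size]
            gradient = dJ_mbgd(theta, X_batch, y_batch)
            theta = theta - learning_rate(t) * gradient
            t += 1
    return theta

Each pass shuffles the data once and then walks through it in mini-batches, so each gradient estimate is less noisy than in SGD while each update remains much cheaper than a full batch.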
Stochastic gradient descent
import numpy as np
import matplotlib.pyplot as plt
m = 100000
x = np.random.normal(size=m)
X = x.reshape(-1,1)
y = 4.*x + 3. + np.random.normal(0, 3, size=m)
plt.scatter(x, y)
plt.show()
Ordinary (batch) gradient descent
def J(theta, X_b, y):
    try:
        return np.sum((y - X_b.dot(theta)) ** 2) / len(y)
    except:
        return float('inf')

def dJ(theta, X_b, y):
    return X_b.T.dot(X_b.dot(theta) - y) * 2. / len(y)

def gradient_descent(X_b, y, initial_theta, eta, n_iters=1e4, epsilon=1e-8):
    theta = initial_theta
    cur_iter = 0

    while cur_iter < n_iters:
        gradient = dJ(theta, X_b, y)
        last_theta = theta
        theta = theta - eta * gradient
        if (abs(J(theta, X_b, y) - J(last_theta, X_b, y)) < epsilon):
            break

        cur_iter += 1

    return theta
%%time
X_b = np.hstack([np.ones((len(X), 1)), X])
initial_theta = np.zeros(X_b.shape[1])
eta = 0.01
theta = gradient_descent(X_b, y, initial_theta, eta)
CPU times: user 2.81 s, sys: 134 ms, total: 2.94 s
Wall time: 1.88 s
theta
array([ 3.00383464, 4.01706856])
Stochastic gradient descent
def dJ_sgd(theta, X_b_i, y_i):
    return 2 * X_b_i.T.dot(X_b_i.dot(theta) - y_i)

def sgd(X_b, y, initial_theta, n_iters):
    t0, t1 = 5, 50
    def learning_rate(t):
        return t0 / (t + t1)

    theta = initial_theta
    for cur_iter in range(n_iters):
        rand_i = np.random.randint(len(X_b))
        gradient = dJ_sgd(theta, X_b[rand_i], y[rand_i])
        theta = theta - learning_rate(cur_iter) * gradient

    return theta
%%time
X_b = np.hstack([np.ones((len(X), 1)), X])
initial_theta = np.zeros(X_b.shape[1])
theta = sgd(X_b, y, initial_theta, n_iters=m//3)
CPU times: user 559 ms, sys: 22.6 ms, total: 582 ms
Wall time: 647 ms
theta
array([ 3.04732375, 4.03214249])
Encapsulating and Testing Stochastic Gradient Descent
def fit_sgd(self, X_train, y_train, n_iters=5, t0=5, t1=50):
    """
    Train the Linear Regression model on X_train, y_train using stochastic gradient descent.
    :param X_train:
    :param y_train:
    :param n_iters: in SGD, n_iters is the number of passes over the whole training set
    :param t0:
    :param t1:
    :return:
    """
    assert X_train.shape[0] == y_train.shape[0], \
        "the size of X_train must be equal to the size of y_train"
    assert n_iters >= 1

    def dJ_sgd(theta, X_b_i, y_i):
        """
        Evaluate the gradient formula on a single randomly chosen element of X_b, y.
        :param theta:
        :param X_b_i:
        :param y_i:
        :return:
        """
        return X_b_i * (X_b_i.dot(theta) - y_i) * 2.

    def sgd(X_b, y, initial_theta, n_iters, t0=5, t1=50):

        def learning_rate(t):
            """
            Compute the learning rate; t1 slows down how quickly it decays, t0 scales the step size.
            :param t: the t-th update
            :return:
            """
            return t0 / (t + t1)

        theta = initial_theta
        m = len(X_b)

        for cur_iter in range(n_iters):
            ## shuffle X_b (and y) into a random order
            indexes = np.random.permutation(m)
            X_b_new = X_b[indexes]
            y_new = y[indexes]
            ## go through the whole dataset once
            for i in range(m):
                gradient = dJ_sgd(theta, X_b_new[i], y_new[i])
                theta = theta - learning_rate(cur_iter * m + i) * gradient

        return theta

    X_b = np.hstack([np.ones((len(X_train), 1)), X_train])
    initial_theta = np.random.randn(X_b.shape[1])
    self._theta = sgd(X_b, y_train, initial_theta, n_iters, t0, t1)

    self.intercept_ = self._theta[0]
    self.coef_ = self._theta[1:]

    return self
Encapsulating Our Own SGD
import numpy as np
import matplotlib.pyplot as plt
m = 100000
x = np.random.normal(size=m)
X = x.reshape(-1,1)
y = 4.*x + 3. + np.random.normal(0, 3, size=m)
from playML.LinearRegression import LinearRegression
lin_reg = LinearRegression()
lin_reg.fit_bgd(X, y)
print(lin_reg.intercept_, lin_reg.coef_)
3.01644183437 [ 3.99995653]
lin_reg = LinearRegression()
lin_reg.fit_sgd(X, y, n_iters=2)
print(lin_reg.intercept_, lin_reg.coef_)
2.99558568395 [ 4.02610767]
Using Our Own SGD on Real Data
from sklearn import datasets
boston = datasets.load_boston()
X = boston.data
y = boston.target
X = X[y < 50.0]
y = y[y < 50.0]
from playML.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, seed=666)
from sklearn.preprocessing import StandardScaler
standardScaler = StandardScaler()
standardScaler.fit(X_train)
X_train_standard = standardScaler.transform(X_train)
X_test_standard = standardScaler.transform(X_test)
from playML.LinearRegression import LinearRegression
lin_reg = LinearRegression()
%time lin_reg.fit_sgd(X_train_standard, y_train, n_iters=2)
lin_reg.score(X_test_standard, y_test)
CPU times: user 11.9 ms, sys: 4.13 ms, total: 16.1 ms
Wall time: 13.5 ms
0.78651716204682975
%time lin_reg.fit_sgd(X_train_standard, y_train, n_iters=50)
lin_reg.score(X_test_standard, y_test)
CPU times: user 155 ms, sys: 8.11 ms, total: 163 ms
Wall time: 158 ms
0.80857287165738345
%time lin_reg.fit_sgd(X_train_standard, y_train, n_iters=100)
lin_reg.score(X_test_standard, y_test)
CPU times: user 282 ms, sys: 5.11 ms, total: 287 ms
Wall time: 287 ms
0.81294846132723497
SGD in scikit-learn
from sklearn.linear_model import SGDRegressor
sgd_reg = SGDRegressor()
%time sgd_reg.fit(X_train_standard, y_train)
sgd_reg.score(X_test_standard, y_test)
CPU times: user 631 μs, sys: 54 μs, total: 685 μs
Wall time: 690 μs
0.80478459701573024
sgd_reg = SGDRegressor(n_iter=50)
%time sgd_reg.fit(X_train_standard, y_train)
sgd_reg.score(X_test_standard, y_test)
CPU times: user 4.5 ms, sys: 1.04 ms, total: 5.54 ms
Wall time: 3.94 ms
0.81199073931878407
Note that the gradient descent implementation in sklearn is considerably more sophisticated than ours and beats it in both performance and accuracy. Our implementation is only meant to demonstrate the process; in production you should use what sklearn provides.
Debugging Gradient Descent
How gradient-descent debugging works
We may derive the gradient formula and implement it in Python, and even though the program raises no error, the gradient we compute may still be wrong. How do we debug and find such an error?
Start with the 2D case: the derivative at a point O is the slope of the curve's tangent at that point. Take a point on each side of O, call them A and B; the slope of the line through A and B is approximately the slope of the tangent at O, and the closer A and B are to O, the closer the two slopes become. This is in fact just the definition of the derivative: when the independent variable x of y = f(x) is given an increment Δx at a point x0, the limit a of the ratio Δy/Δx as Δx tends to 0, if it exists, is the derivative at x0, written f'(x0) or df(x0)/dx.
Extended to multiple dimensions, this becomes the following.
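That is, perturb one component of θ at a time and approximate each partial derivative with a symmetric difference quotient:

\[
\frac{\partial J}{\partial \theta_i} \approx \frac{J(\theta_0,\ldots,\theta_i+\epsilon,\ldots,\theta_n) - J(\theta_0,\ldots,\theta_i-\epsilon,\ldots,\theta_n)}{2\epsilon}
\]

This is exactly what dJ_debug computes below.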
Gradient descent debugging
Debugging the gradient computation
import numpy as np
import matplotlib.pyplot as plt
np.random.seed(666)
X = np.random.random(size=(1000, 10))
true_theta = np.arange(1, 12, dtype=float)
X_b = np.hstack([np.ones((len(X), 1)), X])
y = X_b.dot(true_theta) + np.random.normal(size=1000)
true_theta
array([ 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11.])
X.shape
(1000, 10)
y.shape
(1000,)
def J(theta, X_b, y):
    try:
        return np.sum((y - X_b.dot(theta))**2) / len(X_b)
    except:
        return float('inf')

def dJ_math(theta, X_b, y):
    return X_b.T.dot(X_b.dot(theta) - y) * 2. / len(y)

def dJ_debug(theta, X_b, y, epsilon=0.01):
    res = np.empty(len(theta))
    for i in range(len(theta)):
        theta_1 = theta.copy()
        theta_1[i] += epsilon
        theta_2 = theta.copy()
        theta_2[i] -= epsilon
        res[i] = (J(theta_1, X_b, y) - J(theta_2, X_b, y)) / (2 * epsilon)
    return res
def gradient_descent(dJ, X_b, y, initial_theta, eta, n_iters = 1e4, epsilon=1e-8):
    theta = initial_theta
    cur_iter = 0

    while cur_iter < n_iters:
        gradient = dJ(theta, X_b, y)
        last_theta = theta
        theta = theta - eta * gradient
        if(abs(J(theta, X_b, y) - J(last_theta, X_b, y)) < epsilon):
            break

        cur_iter += 1

    return theta
X_b = np.hstack([np.ones((len(X), 1)), X])
initial_theta = np.zeros(X_b.shape[1])
eta = 0.01
%time theta = gradient_descent(dJ_debug, X_b, y, initial_theta, eta)
theta
CPU times: user 13.8 s, sys: 283 ms, total: 14.1 s
Wall time: 7.6 s
array([ 1.1251597 , 2.05312521, 2.91522497, 4.11895968,
5.05002117, 5.90494046, 6.97383745, 8.00088367,
8.86213468, 9.98608331, 10.90529198])
%time theta = gradient_descent(dJ_math, X_b, y, initial_theta, eta)
theta
CPU times: user 1.57 s, sys: 30.6 ms, total: 1.6 s
Wall time: 856 ms
array([ 1.1251597 , 2.05312521, 2.91522497, 4.11895968,
5.05002117, 5.90494046, 6.97383745, 8.00088367,
8.86213468, 9.98608331, 10.90529198])
As we can see, dJ_debug and dJ_math give essentially the same result, so the mathematical derivation behind dJ_math is correct.
Before doing the actual machine learning, we can use dJ_debug to verify that dJ_math produces the right result, and only then train with dJ_math.
dJ_debug is generic: it can be used to debug any gradient computation, so it is a handy tool to keep in our machine-learning toolbox.