Linear Regression
The output of linear regression is a continuous value, so it is suited to regression problems. Regression problems are common in practice, for example predicting continuous quantities such as house prices, temperatures, and sales figures. Unlike regression, in classification the model's final output is a discrete value. Problems with discrete outputs, such as image classification, spam detection, and disease detection, belong to the category of classification problems; softmax regression is suited to those.
Model
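The model formula appears to be missing here. As an illustration with two input features x_1 and x_2 (the two-feature form is an assumption matching the code below), the linear regression model is

\hat{y} = x_1 w_1 + x_2 w_2 + b

where w_1 and w_2 are the weights and b is the bias.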
Loss function (the factor of 1/2 is added artificially so that the 2 cancels out after taking the derivative)
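Reconstructed from the standard definition, the squared loss for sample i, including the 1/2 factor mentioned above, is

\ell^{(i)}(w_1, w_2, b) = \frac{1}{2}\left(\hat{y}^{(i)} - y^{(i)}\right)^2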
Average loss
Training seeks the parameters that make the average loss as small as possible.
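The average loss over the n training samples, and the training objective of minimizing it, can be written as

L(w_1, w_2, b) = \frac{1}{n} \sum_{i=1}^{n} \ell^{(i)}(w_1, w_2, b), \qquad w_1^*, w_2^*, b^* = \operatorname*{argmin}_{w_1, w_2, b} L(w_1, w_2, b)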
Optimization algorithm
Most deep learning models have no analytical solution; the value of the loss function can only be lowered as far as possible by iterating the model parameters a finite number of times with an optimization algorithm. This is where mini-batch stochastic gradient descent (mini-batch SGD) comes in.
1. First pick initial values for the model parameters.
2. Then iterate the parameters multiple times, so that each iteration may lower the value of the loss function.
2.1 Randomly and uniformly sample a fixed number of training samples to form a mini-batch.
2.2 Compute the derivative (gradient) of the mini-batch's average loss with respect to the model parameters.
2.3 Finally, multiply this gradient by a preset positive number (the learning rate) and use the product as the amount by which the model parameters are reduced in this iteration; the resulting update rule is written out below.
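Putting the three steps together, with learning rate \eta and a sampled mini-batch \mathcal{B} of size |\mathcal{B}|, the standard mini-batch SGD update in each iteration is

(w_1, w_2, b) \leftarrow (w_1, w_2, b) - \frac{\eta}{|\mathcal{B}|} \sum_{i \in \mathcal{B}} \partial_{(w_1, w_2, b)} \ell^{(i)}(w_1, w_2, b)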
Representing linear regression as a neural network
This is a single-layer neural network, and its output layer is a fully connected layer.
Implementing linear regression in PyTorch
import torch
import numpy as np
import random
# generate the dataset
num_inputs = 2 # number of features
num_examples = 1000 # number of samples
true_w = [2,-1.2] # true weights
true_b = 3.1 # true bias
features = torch.randn(num_examples, num_inputs,
dtype=torch.float32)
labels = true_w[0] * features[:, 0] + true_w[1] * features[:, 1] + true_b
labels += torch.tensor(np.random.normal(0, 0.01, size=labels.size()),
dtype=torch.float32)
# read the dataset
def data_iter(batch_size, features, labels):
    num_examples = len(features)
    indices = list(range(num_examples))
    random.shuffle(indices)  # shuffle the sample order so that batches are random
    for i in range(0, num_examples, batch_size):
        # the last batch may contain fewer than batch_size samples
        j = torch.LongTensor(indices[i: min(i + batch_size, num_examples)])
        yield features.index_select(0, j), labels.index_select(0, j)
batch_size = 10
for X, y in data_iter(batch_size, features, labels):
    print(X, '\n', y)
    break
# define the model
def linreg(X, w, b):
    return torch.mm(X, w) + b
# define the loss function
def squared_loss(y_hat, y):
    return (y_hat - y.view(y_hat.size())) ** 2 / 2
# define the optimization function
def sgd(params, lr, batch_size):
    for param in params:
        param.data -= lr * param.grad / batch_size  # update via .data so autograd does not track the step
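The training loop below uses w, b, lr, num_epochs, net and loss, none of which are initialized in this excerpt. A minimal sketch of the missing setup (the concrete values 0.03 and 5 are assumptions, not taken from the original):
# initialize model parameters and training settings (values assumed for illustration)
w = torch.tensor(np.random.normal(0, 0.01, (num_inputs, 1)), dtype=torch.float32)
b = torch.zeros(1, dtype=torch.float32)
w.requires_grad_(requires_grad=True)
b.requires_grad_(requires_grad=True)
lr = 0.03            # learning rate (assumed)
num_epochs = 5       # number of training epochs (assumed)
net = linreg         # the model defined above
loss = squared_loss  # the loss defined above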
# training
for epoch in range(num_epochs):  # training repeats num_epochs times
    # in each epoch, all samples in the dataset are used once
    # X is the feature and y is the label of a batch of samples
    for X, y in data_iter(batch_size, features, labels):
        l = loss(net(X, w, b), y).sum()
        # compute the gradient of the batch loss
        l.backward()
        # use mini-batch stochastic gradient descent to update the model parameters
        sgd([w, b], lr, batch_size)
        # reset the parameter gradients
        w.grad.data.zero_()
        b.grad.data.zero_()
    train_l = loss(net(features, w, b), labels)
    print('epoch %d, loss %f' % (epoch + 1, train_l.mean().item()))
Softmax and Classification Models
In softmax regression the number of output units changes from one to several, and a softmax operation is introduced so that the output is better suited to predicting and training on discrete values.
Model
Network (a single layer whose output layer is a fully connected layer)
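The original equations are missing here. As an illustration with 4 input features and 3 output categories (the 4-and-3 sizing is an assumption), the single-layer network computes

o_1 = x_1 w_{11} + x_2 w_{21} + x_3 w_{31} + x_4 w_{41} + b_1
o_2 = x_1 w_{12} + x_2 w_{22} + x_3 w_{32} + x_4 w_{42} + b_2
o_3 = x_1 w_{13} + x_2 w_{23} + x_3 w_{33} + x_4 w_{43} + b_3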
The softmax operator
It transforms the output values into a probability distribution whose entries are positive and sum to 1.
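Concretely, for q output categories the softmax operation is

\hat{y}_j = \frac{\exp(o_j)}{\sum_{k=1}^{q} \exp(o_k)}

so every \hat{y}_j lies in (0, 1) and the \hat{y}_j sum to 1.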
Vectorized expression
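The original expression is missing; for a mini-batch feature matrix X (one sample per row), the standard vectorized form of softmax regression is

O = X W + b, \qquad \hat{Y} = \mathrm{softmax}(O)

where softmax is applied row by row.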
Cross entropy
Definition of the cross-entropy loss function
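Reconstructed from the standard definition: with the true label of sample i encoded as a one-hot vector y^{(i)},

H\left(y^{(i)}, \hat{y}^{(i)}\right) = -\sum_{j=1}^{q} y_j^{(i)} \log \hat{y}_j^{(i)}

and the cross-entropy loss averages this over all n training samples:

\ell(\Theta) = \frac{1}{n} \sum_{i=1}^{n} H\left(y^{(i)}, \hat{y}^{(i)}\right)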
Classification model in PyTorch
import torch
import torchvision
import numpy as np
import sys
sys.path.append("/home/kesci/input")
import d2lzh as d2l
# installing d2lzh requires mxnet
batch_size = 256
train_iter, test_iter = d2l.load_data_fashion_mnist(batch_size, root='/home/kesci/input/FashionMNIST2065')
num_inputs = 784
print(28*28)  # each Fashion-MNIST image is 28 x 28 = 784 pixels
num_outputs = 10
W = torch.tensor(np.random.normal(0, 0.01, (num_inputs, num_outputs)), dtype=torch.float)
b = torch.zeros(num_outputs, dtype=torch.float)
W.requires_grad_(requires_grad=True)
b.requires_grad_(requires_grad=True)
X = torch.tensor([[1, 2, 3], [4, 5, 6]])
print(X.sum(dim=0, keepdim=True))   # dim=0: sum down each column, keeping the summed dimension in the result
print(X.sum(dim=1, keepdim=True))   # dim=1: sum across each row, keeping the summed dimension in the result
print(X.sum(dim=0, keepdim=False))  # dim=0: sum down each column, without keeping the summed dimension
print(X.sum(dim=1, keepdim=False))  # dim=1: sum across each row, without keeping the summed dimension
def softmax(X):
    X_exp = X.exp()
    partition = X_exp.sum(dim=1, keepdim=True)
    # print("X size is ", X_exp.size())
    # print("partition size is ", partition, partition.size())
    return X_exp / partition  # broadcasting is applied here
X = torch.rand((2, 5))
X_prob = softmax(X)
print(X_prob, '\n', X_prob.sum(dim=1))
def net(X):
    return softmax(torch.mm(X.view((-1, num_inputs)), W) + b)
y_hat = torch.tensor([[0.1, 0.3, 0.6], [0.3, 0.2, 0.5]])
y = torch.LongTensor([0, 2])
y_hat.gather(1, y.view(-1, 1))  # pick, for each sample, the predicted probability of its true class
def cross_entropy(y_hat, y):
    return - torch.log(y_hat.gather(1, y.view(-1, 1)))
def accuracy(y_hat, y):
    return (y_hat.argmax(dim=1) == y).float().mean().item()
print(accuracy(y_hat, y))
# 0.5
# This function is saved in the d2lzh_pytorch package for later use. It will be improved step by step; its full implementation is described in the section on image augmentation.
def evaluate_accuracy(data_iter, net):
    acc_sum, n = 0.0, 0
    for X, y in data_iter:
        acc_sum += (net(X).argmax(dim=1) == y).float().sum().item()
        n += y.shape[0]
    return acc_sum / n
num_epochs, lr = 5, 0.1
# This function is saved in the d2lzh_pytorch package for later use.
def train_ch3(net, train_iter, test_iter, loss, num_epochs, batch_size,
              params=None, lr=None, optimizer=None):
    for epoch in range(num_epochs):
        train_l_sum, train_acc_sum, n = 0.0, 0.0, 0
        for X, y in train_iter:
            y_hat = net(X)
            l = loss(y_hat, y).sum()
            # zero the gradients
            if optimizer is not None:
                optimizer.zero_grad()
            elif params is not None and params[0].grad is not None:
                for param in params:
                    param.grad.data.zero_()
            l.backward()
            if optimizer is None:
                d2l.sgd(params, lr, batch_size)
            else:
                optimizer.step()
            train_l_sum += l.item()
            train_acc_sum += (y_hat.argmax(dim=1) == y).sum().item()
            n += y.shape[0]
        test_acc = evaluate_accuracy(test_iter, net)
        print('epoch %d, loss %.4f, train acc %.3f, test acc %.3f'
              % (epoch + 1, train_l_sum / n, train_acc_sum / n, test_acc))
train_ch3(net, train_iter, test_iter, cross_entropy, num_epochs, batch_size, [W, b], lr)
X, y = next(iter(test_iter))
true_labels = d2l.get_fashion_mnist_labels(y.numpy())
pred_labels = d2l.get_fashion_mnist_labels(net(X).argmax(dim=1).numpy())
titles = [true + '\n' + pred for true, pred in zip(true_labels, pred_labels)]
d2l.show_fashion_mnist(X[0:9], titles[0:9])
Multilayer Perceptron (MLP)
Hidden layer
The example network contains one hidden layer, with 5 hidden units in that layer.
Model
Model derivation
Activation functions are introduced because, even if more hidden layers were added, the design above would still be equivalent to a single-layer neural network containing only the output layer. The root cause is that a fully connected layer only applies an affine transformation to the data, and the composition of multiple affine transformations is itself an affine transformation.
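To make the composition explicit: with hidden layer H = X W_h + b_h and output O = H W_o + b_o, substitution gives

O = (X W_h + b_h) W_o + b_o = X (W_h W_o) + (b_h W_o + b_o)

which is again a single affine transformation of X, with weight W_h W_o and bias b_h W_o + b_o.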
ReLU (rectified linear unit)
The ReLU function keeps only positive elements and sets negative elements to zero.
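In formula form, \mathrm{ReLU}(x) = \max(x, 0).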
Sigmoid function
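The description appears to be missing here; by the standard definition, the sigmoid function transforms an element's value into the interval (0, 1): \mathrm{sigmoid}(x) = \frac{1}{1 + \exp(-x)}.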
Tanh function
The tanh (hyperbolic tangent) function transforms an element's value into the interval between -1 and 1.
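In formula form, \tanh(x) = \frac{1 - \exp(-2x)}{1 + \exp(-2x)}.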
Multilayer perceptron
A multilayer perceptron is a neural network composed of fully connected layers that contains at least one hidden layer, where the output of each hidden layer is transformed by an activation function. The number of layers and the number of hidden units in each hidden layer are hyperparameters.
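As a compact summary in standard notation (not taken verbatim from this excerpt): with a mini-batch X, hidden-layer parameters W_h and b_h, output-layer parameters W_o and b_o, and an element-wise activation function \phi, a one-hidden-layer MLP computes

H = \phi(X W_h + b_h), \qquad O = H W_o + b_o

and for classification, O can then be passed through the softmax operation described above.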
Implementing a multilayer perceptron from scratch
import torch
import numpy as np
import sys
sys.path.append("/home/kesci/input")
import d2lzh as d2l
batch_size = 256
train_iter, test_iter = d2l.load_data_fashion_mnist(batch_size,root='/home/kesci/input/FashionMNIST2065')
num_inputs, num_outputs, num_hiddens = 784, 10, 256
W1 = torch.tensor(np.random.normal(0, 0.01, (num_inputs, num_hiddens)), dtype=torch.float)
b1 = torch.zeros(num_hiddens, dtype=torch.float)
W2 = torch.tensor(np.random.normal(0, 0.01, (num_hiddens, num_outputs)), dtype=torch.float)
b2 = torch.zeros(num_outputs, dtype=torch.float)
params = [W1, b1, W2, b2]
for param in params:
    param.requires_grad_(requires_grad=True)
def relu(X):
    return torch.max(input=X, other=torch.tensor(0.0))
def net(X):
    X = X.view((-1, num_inputs))
    H = relu(torch.matmul(X, W1) + b1)
    return torch.matmul(H, W2) + b2
loss = torch.nn.CrossEntropyLoss()
num_epochs, lr = 5, 100.0  # lr looks large because CrossEntropyLoss averages over the batch while d2l.sgd divides by batch_size again
d2l.train_ch3(net, train_iter, test_iter, loss, num_epochs, batch_size, params, lr)