LeNet: see the previous post on convolutional neural networks.
LeNet's shortcomings:
- Its performance on large, real-world datasets is far from satisfactory.
- Neural network computation was expensive.
- Many areas, such as parameter initialization and non-convex optimization algorithms, had not yet been studied in depth.
Feature extraction in classical machine learning: hand-crafted feature extraction functions.
Feature extraction in neural networks: multi-level representations of the data are learned, with each level expressing increasingly abstract concepts or patterns.
AlexNet
AlexNet was the first to show that learned features can surpass hand-designed features, which broke the prevailing state of computer vision research at a stroke.
Key features:
- 8 layers of transformations: 5 convolutional layers and 2 fully connected hidden layers, plus 1 fully connected output layer.
- Replaces the sigmoid activation function with the simpler ReLU activation function.
- Uses dropout to control the model complexity of the fully connected layers.
- Introduces data augmentation, such as flipping, cropping, and color changes, to enlarge the dataset and thereby mitigate overfitting (see the sketch below).
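A minimal sketch of such augmentations with torchvision.transforms; the specific transforms and parameter values below are illustrative assumptions, not the exact recipe used by AlexNet:

import torchvision

# Illustrative augmentation pipeline: random horizontal flip, random resized crop, color jitter
train_augs = torchvision.transforms.Compose([
    torchvision.transforms.RandomHorizontalFlip(),
    torchvision.transforms.RandomResizedCrop(224, scale=(0.64, 1.0)),
    torchvision.transforms.ColorJitter(brightness=0.4, contrast=0.4, saturation=0.4),
    torchvision.transforms.ToTensor()])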
Core code
import torch
from torch import nn

class AlexNet(nn.Module):
    def __init__(self):
        super(AlexNet, self).__init__()
        self.conv = nn.Sequential(
            nn.Conv2d(1, 96, 11, 4),  # in_channels, out_channels, kernel_size, stride
            nn.ReLU(),
            nn.MaxPool2d(3, 2),  # kernel_size, stride
            # Use a smaller convolution window with padding 2 so the input and output
            # height/width match, and increase the number of output channels
            nn.Conv2d(96, 256, 5, 1, 2),
            nn.ReLU(),
            nn.MaxPool2d(3, 2),
            # Three consecutive convolutional layers with an even smaller window.
            # Except for the last one, the number of output channels is increased further.
            # No pooling layer follows the first two, so height and width are not reduced.
            nn.Conv2d(256, 384, 3, 1, 1),
            nn.ReLU(),
            nn.Conv2d(384, 384, 3, 1, 1),
            nn.ReLU(),
            nn.Conv2d(384, 256, 3, 1, 1),
            nn.ReLU(),
            nn.MaxPool2d(3, 2)
        )
        # The fully connected layers have several times more outputs than in LeNet.
        # Dropout layers are used to mitigate overfitting.
        self.fc = nn.Sequential(
            nn.Linear(256*5*5, 4096),
            nn.ReLU(),
            nn.Dropout(0.5),
            # Slimmed down for a CPU image; on a GPU image the following layer can be added back
            # nn.Linear(4096, 4096),
            # nn.ReLU(),
            # nn.Dropout(0.5),
            # Output layer. Since Fashion-MNIST is used here, the number of classes is 10 rather than 1000 as in the paper
            nn.Linear(4096, 10),
        )

    def forward(self, img):
        feature = self.conv(img)
        output = self.fc(feature.view(img.shape[0], -1))
        return output
net = AlexNet()
print(net)
AlexNet(
(conv): Sequential(
(0): Conv2d(1, 96, kernel_size=(11, 11), stride=(4, 4))
(1): ReLU()
(2): MaxPool2d(kernel_size=3, stride=2, padding=0, dilation=1, ceil_mode=False)
(3): Conv2d(96, 256, kernel_size=(5, 5), stride=(1, 1), padding=(2, 2))
(4): ReLU()
(5): MaxPool2d(kernel_size=3, stride=2, padding=0, dilation=1, ceil_mode=False)
(6): Conv2d(256, 384, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
(7): ReLU()
(8): Conv2d(384, 384, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
(9): ReLU()
(10): Conv2d(384, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
(11): ReLU()
(12): MaxPool2d(kernel_size=3, stride=2, padding=0, dilation=1, ceil_mode=False)
)
(fc): Sequential(
(0): Linear(in_features=6400, out_features=4096, bias=True)
(1): ReLU()
(2): Dropout(p=0.5, inplace=False)
(3): Linear(in_features=4096, out_features=10, bias=True)
)
)
Training
def load_data_fashion_mnist(batch_size, resize=None, root='/home/kesci/input/FashionMNIST2065'):
    """Download the Fashion-MNIST dataset and then load it into memory."""
    trans = []
    if resize:
        trans.append(torchvision.transforms.Resize(size=resize))
    trans.append(torchvision.transforms.ToTensor())
    transform = torchvision.transforms.Compose(trans)
    mnist_train = torchvision.datasets.FashionMNIST(root=root, train=True, download=True, transform=transform)
    mnist_test = torchvision.datasets.FashionMNIST(root=root, train=False, download=True, transform=transform)
    train_iter = torch.utils.data.DataLoader(mnist_train, batch_size=batch_size, shuffle=True, num_workers=2)
    test_iter = torch.utils.data.DataLoader(mnist_test, batch_size=batch_size, shuffle=False, num_workers=2)
    return train_iter, test_iter

batch_size = 16
# If an "out of memory" error occurs, reduce batch_size or resize
train_iter, test_iter = load_data_fashion_mnist(batch_size, 224)

for X, Y in train_iter:
    print('X =', X.shape,
          '\nY =', Y.type(torch.int32))
    break
lr, num_epochs = 0.001, 3
optimizer = torch.optim.Adam(net.parameters(), lr=lr)
d2l.train_ch5(net, train_iter, test_iter, batch_size, optimizer, device, num_epochs)
Networks with repeating elements (VGG)
VGG: builds deep models by repeatedly using simple building blocks.
VGG block: several identical convolutional layers with padding 1 and a 3×3 window, followed by a max pooling layer with stride 2 and a 2×2 window.
The convolutional layers keep the input height and width unchanged, while the pooling layer halves them.
VGG_Net block
def vgg_block(num_convs, in_channels, out_channels):  # number of conv layers, input channels, output channels
    blk = []
    for i in range(num_convs):
        if i == 0:
            blk.append(nn.Conv2d(in_channels, out_channels, kernel_size=3, padding=1))
        else:
            blk.append(nn.Conv2d(out_channels, out_channels, kernel_size=3, padding=1))
        blk.append(nn.ReLU())
    blk.append(nn.MaxPool2d(kernel_size=2, stride=2))  # this halves the height and width
    return nn.Sequential(*blk)
Building the full VGG network
conv_arch = ((1, 1, 64), (1, 64, 128), (2, 128, 256), (2, 256, 512), (2, 512, 512))
# After 5 vgg_blocks the height and width are halved 5 times, becoming 224/32 = 7
fc_features = 512 * 7 * 7  # c * w * h
fc_hidden_units = 4096  # arbitrary

def vgg(conv_arch, fc_features, fc_hidden_units=4096):
    net = nn.Sequential()
    # Convolutional part
    for i, (num_convs, in_channels, out_channels) in enumerate(conv_arch):
        # Each vgg_block halves the height and width
        net.add_module("vgg_block_" + str(i+1), vgg_block(num_convs, in_channels, out_channels))
    # Fully connected part
    net.add_module("fc", nn.Sequential(d2l.FlattenLayer(),
                                       nn.Linear(fc_features, fc_hidden_units),
                                       nn.ReLU(),
                                       nn.Dropout(0.5),
                                       nn.Linear(fc_hidden_units, fc_hidden_units),
                                       nn.ReLU(),
                                       nn.Dropout(0.5),
                                       nn.Linear(fc_hidden_units, 10)
                                       ))
    return net

net = vgg(conv_arch, fc_features, fc_hidden_units)
X = torch.rand(1, 1, 224, 224)

# named_children returns the first-level submodules and their names
# (named_modules would return all submodules, including submodules of submodules)
for name, blk in net.named_children():
    X = blk(X)
    print(name, 'output shape: ', X.shape)
ratio = 8
small_conv_arch = [(1, 1, 64//ratio), (1, 64//ratio, 128//ratio), (2, 128//ratio, 256//ratio),
                   (2, 256//ratio, 512//ratio), (2, 512//ratio, 512//ratio)]
net = vgg(small_conv_arch, fc_features // ratio, fc_hidden_units // ratio)
print(net)

batch_size = 16
# batch_size = 64
# If an "out of memory" error occurs, reduce batch_size or resize
# train_iter, test_iter = d2l.load_data_fashion_mnist(batch_size, resize=224)
lr, num_epochs = 0.001, 5
optimizer = torch.optim.Adam(net.parameters(), lr=lr)
d2l.train_ch5(net, train_iter, test_iter, batch_size, optimizer, device, num_epochs)
Network in Network (NiN)
LeNet, AlexNet, and VGG: first fully extract spatial features with modules made of convolutional layers, then output classification results with modules made of fully connected layers.
NiN: builds a deep network by stringing together multiple small networks, each consisting of a convolutional layer plus 1×1 convolutional layers that play the role of fully connected layers.
NiN uses a NiN block whose number of output channels equals the number of label classes, followed by a global average pooling layer that averages all elements in each channel and uses the result directly for classification.
Roles of the 1×1 convolution kernel:
1. Scaling the number of channels: the number of channels is scaled up or down by controlling the number of kernels.
2. Adding nonlinearity: the computation of a 1×1 convolution is equivalent to that of a fully connected layer applied at each spatial position, and it is followed by a nonlinear activation, which increases the network's nonlinearity (see the sketch after this list).
3. Fewer parameters to compute.
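A small sketch illustrating point 2: a 1×1 convolution computes the same result as a fully connected layer applied to the channel vector at every spatial position (the shapes below are arbitrary examples):

import torch
from torch import nn

conv1x1 = nn.Conv2d(3, 2, kernel_size=1)           # maps 3 channels to 2 channels
fc = nn.Linear(3, 2)
fc.weight.data = conv1x1.weight.data.view(2, 3)    # reuse the same weights and bias
fc.bias.data = conv1x1.bias.data

X = torch.rand(1, 3, 4, 4)
Y1 = conv1x1(X)                                    # shape (1, 2, 4, 4)
# Apply the linear layer to the channel vector at each pixel: (1, 4, 4, 3) -> (1, 4, 4, 2)
Y2 = fc(X.permute(0, 2, 3, 1)).permute(0, 3, 1, 2)
print(torch.allclose(Y1, Y2, atol=1e-6))           # expected: True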
Block:
def nin_block(in_channels, out_channels, kernel_size, stride, padding):
    blk = nn.Sequential(nn.Conv2d(in_channels, out_channels, kernel_size, stride, padding),
                        nn.ReLU(),
                        nn.Conv2d(out_channels, out_channels, kernel_size=1),
                        nn.ReLU(),
                        nn.Conv2d(out_channels, out_channels, kernel_size=1),
                        nn.ReLU())
    return blk
NiN:
import torch.nn.functional as F

class GlobalAvgPool2d(nn.Module):
    # Global average pooling can be implemented by setting the pooling window shape to the input's height and width
    def __init__(self):
        super(GlobalAvgPool2d, self).__init__()
    def forward(self, x):
        return F.avg_pool2d(x, kernel_size=x.size()[2:])

net = nn.Sequential(
    nin_block(1, 96, kernel_size=11, stride=4, padding=0),
    nn.MaxPool2d(kernel_size=3, stride=2),
    nin_block(96, 256, kernel_size=5, stride=1, padding=2),
    nn.MaxPool2d(kernel_size=3, stride=2),
    nin_block(256, 384, kernel_size=3, stride=1, padding=1),
    nn.MaxPool2d(kernel_size=3, stride=2),
    nn.Dropout(0.5),
    # The number of label classes is 10
    nin_block(384, 10, kernel_size=3, stride=1, padding=1),
    GlobalAvgPool2d(),
    # Convert the four-dimensional output to two dimensions with shape (batch size, 10)
    d2l.FlattenLayer())
Notes:
NiN builds a deep network by repeatedly using NiN blocks, each made of a convolutional layer plus 1×1 convolutional layers that replace fully connected layers.
NiN removes the fully connected output layer, which easily causes overfitting, replacing it with a NiN block whose number of output channels equals the number of label classes, followed by a global average pooling layer.
These design ideas of NiN influenced the design of a series of later convolutional neural networks.
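As with VGG, a quick shape check can be sketched for the NiN net defined above (assuming torch and d2l are already imported as in the earlier sections):

X = torch.rand(1, 1, 224, 224)
for name, blk in net.named_children():
    X = blk(X)
    print(name, 'output shape: ', X.shape)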
GoogLeNet
- Composed of Inception building blocks.
- An Inception block is a small network with 4 parallel paths. It extracts information in parallel through convolutional layers with different window shapes and a max pooling layer, and uses 1×1 convolutions to reduce the number of channels and thereby the model complexity.
- The customizable hyperparameters are the numbers of output channels of each layer, which is how the model complexity is controlled.
The Inception block
class Inception(nn.Module):
    # c1 - c4 are the numbers of output channels of the layers on each path
    def __init__(self, in_c, c1, c2, c3, c4):
        super(Inception, self).__init__()
        # Path 1: a single 1 x 1 convolutional layer
        self.p1_1 = nn.Conv2d(in_c, c1, kernel_size=1)
        # Path 2: a 1 x 1 convolutional layer followed by a 3 x 3 convolutional layer
        self.p2_1 = nn.Conv2d(in_c, c2[0], kernel_size=1)
        self.p2_2 = nn.Conv2d(c2[0], c2[1], kernel_size=3, padding=1)
        # Path 3: a 1 x 1 convolutional layer followed by a 5 x 5 convolutional layer
        self.p3_1 = nn.Conv2d(in_c, c3[0], kernel_size=1)
        self.p3_2 = nn.Conv2d(c3[0], c3[1], kernel_size=5, padding=2)
        # Path 4: a 3 x 3 max pooling layer followed by a 1 x 1 convolutional layer
        self.p4_1 = nn.MaxPool2d(kernel_size=3, stride=1, padding=1)
        self.p4_2 = nn.Conv2d(in_c, c4, kernel_size=1)

    def forward(self, x):
        p1 = F.relu(self.p1_1(x))
        p2 = F.relu(self.p2_2(F.relu(self.p2_1(x))))
        p3 = F.relu(self.p3_2(F.relu(self.p3_1(x))))
        p4 = F.relu(self.p4_2(self.p4_1(x)))
        return torch.cat((p1, p2, p3, p4), dim=1)  # concatenate the outputs along the channel dimension
The GoogLeNet model
Complete model structure
b1 = nn.Sequential(nn.Conv2d(1, 64, kernel_size=7, stride=2, padding=3),
                   nn.ReLU(),
                   nn.MaxPool2d(kernel_size=3, stride=2, padding=1))

b2 = nn.Sequential(nn.Conv2d(64, 64, kernel_size=1),
                   nn.Conv2d(64, 192, kernel_size=3, padding=1),
                   nn.MaxPool2d(kernel_size=3, stride=2, padding=1))

b3 = nn.Sequential(Inception(192, 64, (96, 128), (16, 32), 32),
                   Inception(256, 128, (128, 192), (32, 96), 64),
                   nn.MaxPool2d(kernel_size=3, stride=2, padding=1))

b4 = nn.Sequential(Inception(480, 192, (96, 208), (16, 48), 64),
                   Inception(512, 160, (112, 224), (24, 64), 64),
                   Inception(512, 128, (128, 256), (24, 64), 64),
                   Inception(512, 112, (144, 288), (32, 64), 64),
                   Inception(528, 256, (160, 320), (32, 128), 128),
                   nn.MaxPool2d(kernel_size=3, stride=2, padding=1))

b5 = nn.Sequential(Inception(832, 256, (160, 320), (32, 128), 128),
                   Inception(832, 384, (192, 384), (48, 128), 128),
                   d2l.GlobalAvgPool2d())

net = nn.Sequential(b1, b2, b3, b4, b5,
                    d2l.FlattenLayer(), nn.Linear(1024, 10))
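A shape check for the GoogLeNet net can be sketched the same way; a smaller 96×96 input is assumed here just to keep the computation light:

X = torch.rand(1, 1, 96, 96)
for blk in net.children():
    X = blk(X)
    print('output shape: ', X.shape)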
Next we move on to ResNet, and after that DenseNet.
ResNet differs from the networks above in adding the step of batch normalization.
Batch normalization (BatchNorm)
Standardizing the input (shallow models):
After processing, each feature has mean 0 and standard deviation 1 over all samples in the dataset.
Standardizing the input data makes the distributions of the individual features similar (see the sketch below).
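A minimal sketch of this kind of standardization on a hypothetical feature matrix (rows are samples, columns are features):

import torch

X = torch.rand(100, 3)                        # 100 samples, 3 features (hypothetical data)
X_std = (X - X.mean(dim=0)) / X.std(dim=0)    # each feature now has mean 0 and std 1 over the dataset
print(X_std.mean(dim=0), X_std.std(dim=0))    # approximately 0 and 1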
Batch normalization (deep models):
Uses the mean and standard deviation computed on each mini-batch to continually adjust the intermediate outputs of the network, making the values of the intermediate outputs at every layer more stable.
1. Batch normalization for fully connected layers
Position: between the affine transformation and the activation function of the fully connected layer.
Fully connected layer: x = Wu + b, output = φ(x); with batch normalization this becomes output = φ(BN(x)).
Batch normalization: for a mini-batch B = {x(1), ..., x(m)}, compute
  μ_B = (1/m) Σ_i x(i),
  σ_B² = (1/m) Σ_i (x(i) − μ_B)²,
  x̂(i) = (x(i) − μ_B) / √(σ_B² + ε),
  y(i) = γ ⊙ x̂(i) + β.
Here ε > 0 is a small constant that keeps the denominator positive.
Learnable parameters are introduced: a scale parameter γ and a shift parameter β. If γ = √(σ_B² + ε) and β = μ_B, batch normalization has no effect (it simply undoes the standardization).
2. Batch normalization for convolutional layers
Position: after the convolution computation and before the activation function.
If the convolution outputs multiple channels, batch normalization is applied to each channel's output separately, and each channel has its own scale and shift parameters.
Computation: for a single channel with batch size m and a convolution output of shape p×q,
batch normalization is applied jointly to the m×p×q elements of that channel, using the same mean and variance (see the sketch below).
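A sketch that checks this per-channel computation against nn.BatchNorm2d in training mode; with the default initialization (scale 1, shift 0) its output should match a manual normalization over the m×p×q elements of each channel:

import torch
from torch import nn

X = torch.rand(8, 3, 5, 7)                                  # m=8 samples, 3 channels, p=5, q=7
bn = nn.BatchNorm2d(3)                                      # scale=1, shift=0 at initialization
bn.train()

mean = X.mean(dim=(0, 2, 3), keepdim=True)                  # one mean per channel
var = ((X - mean) ** 2).mean(dim=(0, 2, 3), keepdim=True)   # biased variance over the m*p*q elements
X_hat = (X - mean) / torch.sqrt(var + bn.eps)

print(torch.allclose(bn(X), X_hat, atol=1e-5))              # expected: True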
3. Batch normalization at prediction time
Training: means and variances are computed per batch, batch by batch.
Prediction: moving averages are used to estimate the mean and variance over the whole training dataset (see the sketch below).
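A short sketch of this difference in PyTorch: in train() mode BatchNorm normalizes with the current batch statistics and updates its running estimates, while in eval() mode it uses the stored running_mean and running_var:

import torch
from torch import nn

bn = nn.BatchNorm2d(3)

bn.train()
_ = bn(torch.rand(8, 3, 5, 5))   # normalizes with batch statistics and updates bn.running_mean / bn.running_var
print(bn.running_mean)           # no longer all zeros

bn.eval()
_ = bn(torch.rand(8, 3, 5, 5))   # normalizes with the stored running statistics; no further update
print(bn.running_mean)           # unchanged by the eval-mode forward pass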
Implementation from scratch
def batch_norm(is_training, X, gamma, beta, moving_mean, moving_var, eps, momentum):
    # Determine whether we are in training mode or prediction mode
    if not is_training:
        # In prediction mode, directly use the moving-average mean and variance that were passed in
        X_hat = (X - moving_mean) / torch.sqrt(moving_var + eps)
    else:
        assert len(X.shape) in (2, 4)
        if len(X.shape) == 2:
            # Fully connected layer: compute the mean and variance over the feature dimension
            mean = X.mean(dim=0)
            var = ((X - mean) ** 2).mean(dim=0)
        else:
            # 2D convolutional layer: compute the mean and variance per channel (axis=1).
            # Keep X's shape so that broadcasting works later
            mean = X.mean(dim=0, keepdim=True).mean(dim=2, keepdim=True).mean(dim=3, keepdim=True)
            var = ((X - mean) ** 2).mean(dim=0, keepdim=True).mean(dim=2, keepdim=True).mean(dim=3, keepdim=True)
        # In training mode, standardize with the current mean and variance
        X_hat = (X - mean) / torch.sqrt(var + eps)
        # Update the moving averages of the mean and variance
        moving_mean = momentum * moving_mean + (1.0 - momentum) * mean
        moving_var = momentum * moving_var + (1.0 - momentum) * var
    Y = gamma * X_hat + beta  # scale and shift
    return Y, moving_mean, moving_var
class BatchNorm(nn.Module):
    def __init__(self, num_features, num_dims):
        super(BatchNorm, self).__init__()
        if num_dims == 2:
            shape = (1, num_features)  # number of outputs of a fully connected layer
        else:
            shape = (1, num_features, 1, 1)  # number of channels
        # Scale and shift parameters that participate in gradient computation and iteration,
        # initialized to 1 and 0 respectively
        self.gamma = nn.Parameter(torch.ones(shape))
        self.beta = nn.Parameter(torch.zeros(shape))
        # Variables that do not participate in gradient computation or iteration,
        # initialized to 0 in (CPU) memory
        self.moving_mean = torch.zeros(shape)
        self.moving_var = torch.zeros(shape)

    def forward(self, X):
        # If X is not in (CPU) memory, copy moving_mean and moving_var to the device where X lives
        if self.moving_mean.device != X.device:
            self.moving_mean = self.moving_mean.to(X.device)
            self.moving_var = self.moving_var.to(X.device)
        # Save the updated moving_mean and moving_var. A Module's training attribute is True by default
        # and is set to False after calling .eval()
        Y, self.moving_mean, self.moving_var = batch_norm(self.training,
            X, self.gamma, self.beta, self.moving_mean,
            self.moving_var, eps=1e-5, momentum=0.9)
        return Y
Applying it to LeNet
net = nn.Sequential(
    nn.Conv2d(1, 6, 5),  # in_channels, out_channels, kernel_size
    BatchNorm(6, num_dims=4),
    nn.Sigmoid(),
    nn.MaxPool2d(2, 2),  # kernel_size, stride
    nn.Conv2d(6, 16, 5),
    BatchNorm(16, num_dims=4),
    nn.Sigmoid(),
    nn.MaxPool2d(2, 2),
    d2l.FlattenLayer(),
    nn.Linear(16*4*4, 120),
    BatchNorm(120, num_dims=2),
    nn.Sigmoid(),
    nn.Linear(120, 84),
    BatchNorm(84, num_dims=2),
    nn.Sigmoid(),
    nn.Linear(84, 10)
)
print(net)
Sequential(
(0): Conv2d(1, 6, kernel_size=(5, 5), stride=(1, 1))
(1): BatchNorm()
(2): Sigmoid()
(3): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
(4): Conv2d(6, 16, kernel_size=(5, 5), stride=(1, 1))
(5): BatchNorm()
(6): Sigmoid()
(7): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
(8): FlattenLayer()
(9): Linear(in_features=256, out_features=120, bias=True)
(10): BatchNorm()
(11): Sigmoid()
(12): Linear(in_features=120, out_features=84, bias=True)
(13): BatchNorm()
(14): Sigmoid()
(15): Linear(in_features=84, out_features=10, bias=True)
)
Residual networks (ResNet)
A problem in deep learning: once a deep CNN reaches a certain depth, simply piling on more layers does not further improve classification performance; instead, the network converges more slowly and accuracy becomes worse.
Residual block
Fitting the identity mapping:
Left: fit f(x) = x directly.
Right: fit the residual f(x) - x = 0 (this makes it easier to capture small fluctuations around the identity mapping).
In a residual block, the input can propagate forward faster through the cross-layer data path.
class Residual(nn.Module):
    # The number of output channels, whether to use an extra 1x1 convolution to change the number
    # of channels, and the stride of the convolution can all be configured.
    def __init__(self, in_channels, out_channels, use_1x1conv=False, stride=1):
        super(Residual, self).__init__()
        self.conv1 = nn.Conv2d(in_channels, out_channels, kernel_size=3, padding=1, stride=stride)
        self.conv2 = nn.Conv2d(out_channels, out_channels, kernel_size=3, padding=1)
        if use_1x1conv:
            self.conv3 = nn.Conv2d(in_channels, out_channels, kernel_size=1, stride=stride)
        else:
            self.conv3 = None
        self.bn1 = nn.BatchNorm2d(out_channels)
        self.bn2 = nn.BatchNorm2d(out_channels)

    def forward(self, X):
        Y = F.relu(self.bn1(self.conv1(X)))
        Y = self.bn2(self.conv2(Y))
        if self.conv3:
            X = self.conv3(X)
        return F.relu(Y + X)
blk = Residual(3, 3)
X = torch.rand((4, 3, 6, 6))
blk(X).shape # torch.Size([4, 3, 6, 6])
blk = Residual(3, 6, use_1x1conv=True, stride=2)
blk(X).shape # torch.Size([4, 6, 3, 3])
We now build a ResNet model.
ResNet model structure:
Convolution (64 output channels, 7×7 kernel, stride 2, padding 3)
Batch normalization
Max pooling (3×3, stride 2)
4 residual modules (a stride-2 residual block at the start of each module reduces the height and width between modules)
Global average pooling
Fully connected layer
net = nn.Sequential(
    nn.Conv2d(1, 64, kernel_size=7, stride=2, padding=3),
    nn.BatchNorm2d(64),
    nn.ReLU(),
    nn.MaxPool2d(kernel_size=3, stride=2, padding=1))

def resnet_block(in_channels, out_channels, num_residuals, first_block=False):
    if first_block:
        assert in_channels == out_channels  # the first module's channel count equals the input channel count
    blk = []
    for i in range(num_residuals):
        if i == 0 and not first_block:
            blk.append(Residual(in_channels, out_channels, use_1x1conv=True, stride=2))
        else:
            blk.append(Residual(out_channels, out_channels))
    return nn.Sequential(*blk)

net.add_module("resnet_block1", resnet_block(64, 64, 2, first_block=True))
net.add_module("resnet_block2", resnet_block(64, 128, 2))
net.add_module("resnet_block3", resnet_block(128, 256, 2))
net.add_module("resnet_block4", resnet_block(256, 512, 2))
net.add_module("global_avg_pool", d2l.GlobalAvgPool2d())  # output of GlobalAvgPool2d: (Batch, 512, 1, 1)
net.add_module("fc", nn.Sequential(d2l.FlattenLayer(), nn.Linear(512, 10)))
X = torch.rand((1, 1, 224, 224))
for name, layer in net.named_children():
X = layer(X)
print(name, ' output shape:\t', X.shape)
0 output shape: torch.Size([1, 64, 112, 112])
1 output shape: torch.Size([1, 64, 112, 112])
2 output shape: torch.Size([1, 64, 112, 112])
3 output shape: torch.Size([1, 64, 56, 56])
resnet_block1 output shape: torch.Size([1, 64, 56, 56])
resnet_block2 output shape: torch.Size([1, 128, 28, 28])
resnet_block3 output shape: torch.Size([1, 256, 14, 14])
resnet_block4 output shape: torch.Size([1, 512, 7, 7])
global_avg_pool output shape: torch.Size([1, 512, 1, 1])
fc output shape: torch.Size([1, 10])
Likewise, we set the learning rate and the number of epochs.
lr, num_epochs = 0.001, 5
optimizer = torch.optim.Adam(net.parameters(), lr=lr)
d2l.train_ch5(net, train_iter, test_iter, batch_size, optimizer, device, num_epochs)
Densely connected networks (DenseNet)
Main building blocks:
Dense block: defines how the inputs and outputs are concatenated.
Transition layer: controls the number of channels so that it does not grow too large.
Dense block
def conv_block(in_channels, out_channels):
    blk = nn.Sequential(nn.BatchNorm2d(in_channels),
                        nn.ReLU(),
                        nn.Conv2d(in_channels, out_channels, kernel_size=3, padding=1))
    return blk

class DenseBlock(nn.Module):
    def __init__(self, num_convs, in_channels, out_channels):
        super(DenseBlock, self).__init__()
        net = []
        for i in range(num_convs):
            in_c = in_channels + i * out_channels
            net.append(conv_block(in_c, out_channels))
        self.net = nn.ModuleList(net)
        self.out_channels = in_channels + num_convs * out_channels  # compute the number of output channels

    def forward(self, X):
        for blk in self.net:
            Y = blk(X)
            X = torch.cat((X, Y), dim=1)  # concatenate the input and output along the channel dimension
        return X
A quick test:
blk = DenseBlock(2, 3, 10)
X = torch.rand(4, 3, 8, 8)
Y = blk(X)
Y.shape # torch.Size([4, 23, 8, 8])
torch.Size([4, 23, 8, 8])
Transition layer
1×1 convolutional layer: reduces the number of channels.
Average pooling layer with stride 2: halves the height and width.
def transition_block(in_channels, out_channels):
    blk = nn.Sequential(
        nn.BatchNorm2d(in_channels),
        nn.ReLU(),
        nn.Conv2d(in_channels, out_channels, kernel_size=1),
        nn.AvgPool2d(kernel_size=2, stride=2))
    return blk

blk = transition_block(23, 10)
blk(Y).shape # torch.Size([4, 10, 4, 4])
The DenseNet model
net = nn.Sequential(
    nn.Conv2d(1, 64, kernel_size=7, stride=2, padding=3),
    nn.BatchNorm2d(64),
    nn.ReLU(),
    nn.MaxPool2d(kernel_size=3, stride=2, padding=1))

num_channels, growth_rate = 64, 32  # num_channels is the current number of channels
num_convs_in_dense_blocks = [4, 4, 4, 4]

for i, num_convs in enumerate(num_convs_in_dense_blocks):
    DB = DenseBlock(num_convs, num_channels, growth_rate)
    net.add_module("DenseBlock_%d" % i, DB)
    # The number of output channels of the previous dense block
    num_channels = DB.out_channels
    # Insert a transition layer that halves the number of channels between dense blocks
    if i != len(num_convs_in_dense_blocks) - 1:
        net.add_module("transition_block_%d" % i, transition_block(num_channels, num_channels // 2))
        num_channels = num_channels // 2

net.add_module("BN", nn.BatchNorm2d(num_channels))
net.add_module("relu", nn.ReLU())
net.add_module("global_avg_pool", d2l.GlobalAvgPool2d())  # output of GlobalAvgPool2d: (Batch, num_channels, 1, 1)
net.add_module("fc", nn.Sequential(d2l.FlattenLayer(), nn.Linear(num_channels, 10)))

X = torch.rand((1, 1, 96, 96))
for name, layer in net.named_children():
    X = layer(X)
    print(name, ' output shape:\t', X.shape)
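Training can then be run in the same way as for the earlier models; the sketch below assumes the load_data_fashion_mnist helper, d2l, and device from the sections above, with resize=96 to match the shape check:

batch_size = 16
train_iter, test_iter = load_data_fashion_mnist(batch_size, resize=96)
lr, num_epochs = 0.001, 5
optimizer = torch.optim.Adam(net.parameters(), lr=lr)
d2l.train_ch5(net, train_iter, test_iter, batch_size, optimizer, device, num_epochs)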