實驗準備
基礎網絡搭建
為了實現神經網絡的 deep compression，首先要訓練一個深度神經網絡。為了方便實現，這裡實現一個兩層卷積 + 兩層 MLP 的神經網絡。
class net(pt.nn.Module):
    """Small image classifier: two 3x3 convolutions followed by a two-layer MLP.

    Each convolution is followed by ReLU and 2x2 max pooling. The first
    fully connected layer expects 7*7*256 features per sample, and the
    network emits 10 outputs per sample.
    """

    def __init__(self):
        super().__init__()
        # Registration order determines parameter ordering and how the
        # module prints: conv1, conv2, fc1, fc2, pool.
        self.conv1 = pt.nn.Conv2d(1, 64, kernel_size=3, padding=1)
        self.conv2 = pt.nn.Conv2d(64, 256, kernel_size=3, padding=1)
        self.fc1 = pt.nn.Linear(7 * 7 * 256, 512)
        self.fc2 = pt.nn.Linear(512, 10)
        self.pool = pt.nn.MaxPool2d(2)

    def forward(self, x):
        """Return the 10 output scores for every sample in batch ``x``."""
        relu = pt.nn.functional.relu
        for conv in (self.conv1, self.conv2):
            x = self.pool(relu(conv(x)))
        # Flatten the pooled feature maps to one row per sample.
        flat = x.view(-1, 7 * 7 * 256)
        hidden = relu(self.fc1(flat))
        return self.fc2(hidden)
# Build the network on the GPU, print its layer summary, and run one random
# 1x1x28x28 input through it as a quick forward-pass check.
model = net().cuda()
print(model)
print(model(pt.rand(1,1,28,28).cuda()))
net(
(conv1): Conv2d(1, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
(conv2): Conv2d(64, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
(fc1): Linear(in_features=12544, out_features=512, bias=True)
(fc2): Linear(in_features=512, out_features=10, bias=True)
(pool): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
)
tensor(1.00000e-02 *
[[-7.7157, 3.0435, -6.5732, 6.5343, -4.2159, -2.8651, -0.6792,
3.9223, -3.7523, 2.4532]], device='cuda:0')
基礎網絡訓練
準備數據集
# MNIST train and test splits, converted to tensors. Both request a download
# so that either line works on its own.
train_dataset = ptv.datasets.MNIST("./", download=True, transform=ptv.transforms.ToTensor())
test_dataset = ptv.datasets.MNIST("./", train=False, download=True, transform=ptv.transforms.ToTensor())
trainloader = pt.utils.data.DataLoader(train_dataset, shuffle=True, batch_size=128)
# Evaluation gains nothing from shuffling; a fixed order keeps repeated
# evaluations of the same weights reproducible.
testloader = pt.utils.data.DataLoader(test_dataset, shuffle=False, batch_size=128)
代價函數與優化器
# Cross-entropy loss on the GPU, and Adam over all parameters of `model`.
lossfunc = pt.nn.CrossEntropyLoss().cuda()
optimizer = pt.optim.Adam(model.parameters(), lr=1e-4)
def acc(outputs, label):
    """Return the fraction of rows of ``outputs`` whose arg-max equals ``label``.

    Args:
        outputs: 2-D tensor of per-class scores, one row per sample.
        label: 1-D tensor of target class indices, one per row.

    Returns:
        The batch accuracy as a Python float.
    """
    predicted = pt.max(outputs, dim=1)[1]
    hits = predicted.float() == label.float()
    return hits.float().mean().item()
網絡訓練
# One pass over the training set; the accuracy of the current batch is
# printed every 100 steps.
for _ in range(1):
    for i, (data, label) in enumerate(trainloader):
        data = data.cuda()
        label = label.cuda()
        model.zero_grad()
        outputs = model(data)
        loss = lossfunc(outputs, label)
        loss.backward()
        optimizer.step()
        if i % 100 != 0:
            continue
        print(i, acc(outputs, label))
0 0.1171875
100 0.8984375
200 0.953125
300 0.984375
400 0.96875
測試網絡
def test_model(model, testloader):
    """Evaluate ``model`` on ``testloader`` and return its top-1 accuracy.

    The accuracy is the number of correctly classified samples divided by
    the total number of samples, so a shorter final batch is weighted by its
    size. Evaluation runs under ``no_grad`` so no autograd graph is built.
    Each batch is moved to the device of the model's first parameter.

    Args:
        model: Module mapping a data batch to per-class scores.
        testloader: Iterable yielding ``(data, label)`` tensor pairs.

    Returns:
        The accuracy as a float in [0, 1]; 0.0 when the loader is empty.
        The value is also printed.
    """
    first_param = next(model.parameters(), None)
    correct = 0
    total = 0
    with pt.no_grad():
        for data, label in testloader:
            if first_param is not None:
                data = data.to(first_param.device)
                label = label.to(first_param.device)
            outputs = model(data)
            _, predicted = pt.max(outputs, dim=1)
            correct += (predicted == label).sum().item()
            total += label.shape[0]
    result = correct / total if total else 0.0
    print(result)
    return result


# Despite its "test_" prefix this is an evaluation helper, not a test case;
# this flag keeps pytest-style collectors from trying to run it.
test_model.__test__ = False
# Evaluate the trained baseline network on the test loader.
test_model(model,testloader)
0.96875
保存網絡
# Save the baseline weights; later steps reload them from this file.
pt.save(model.state_dict(),"./base.ptb")
剪枝實驗
剪枝是 deep compression 的第一步，含義是將部分較小（絕對值小於某個閾值）的權值置為 0，表示這個連接被剪掉；且在之後的微調過程中，這個連接的梯度也將被置為 0，即不參加訓練。
準備相關工具
剪枝實驗需要準備一些函數：剪枝函數、梯度剪枝函數和稀疏度評估函數。
剪枝函數
剪枝函數輸入模型和閾值，將所有絕對值小於閾值的權值置為 0。
def puring(model, threshold):
    """Zero, in place, every parameter entry whose magnitude is below ``threshold``.

    Every tensor yielded by ``model.parameters()`` is processed.

    Args:
        model: Module whose parameters are pruned in place.
        threshold: Entries with absolute value strictly below this are zeroed.

    Returns:
        The same ``model`` object that was passed in.
    """
    for param in model.parameters():
        too_small = param.data.abs() < threshold
        param.data.masked_fill_(too_small, 0)
    return model
梯度剪枝函數
def grad_puring(model):
    """Zero the gradient of every parameter entry whose current value is 0.

    Called between ``backward()`` and the optimizer step so that pruned
    (zero-valued) entries receive no gradient. Parameters that have no
    gradient yet (``grad is None``) are skipped.

    Args:
        model: Module whose parameter gradients are masked in place.
    """
    for param in model.parameters():
        if param.grad is None:
            continue
        # Build the 0/1 mask from raw data so no autograd graph is recorded.
        keep = (param.data != 0).to(param.grad.dtype)
        param.grad.data.mul_(keep)
稀疏度評估函數
def print_sparse(model):
    """Print and return the fraction of non-zero parameter entries.

    For every named parameter the ratio ``non-zero entries / all entries``
    is printed (1.0 means no zeros at all), followed by the ratio over the
    whole model.

    Args:
        model: Module whose ``named_parameters()`` are inspected.

    Returns:
        The overall non-zero fraction as a float; 0.0 when the model has no
        parameter entries.
    """
    total_num = 0
    total_nonzero = 0
    print("-----------------------------------")
    print("Layer sparse")
    for name, f in model.named_parameters():
        num = f.numel()
        nonzero = pt.nonzero(f.data).shape[0]
        total_num += num
        total_nonzero += nonzero
        # Guard against empty tensors instead of dividing by zero.
        print("\t", name, nonzero / num if num else 0.0)
    total = total_nonzero / total_num if total_num else 0.0
    print("Total:", total)
    return total
剪枝
首先，查看原有網絡的稀疏度情況（下面輸出的是各層非零參數所佔的比例，1.0 表示完全沒有稀疏性）
# Rebuild the network, restore the saved baseline weights, and re-check
# the test accuracy (return value discarded).
model = net().cuda()
model.load_state_dict(pt.load("./base.ptb"))
_ = test_model(model,testloader)
0.96875
# Report the per-layer and overall non-zero ratio of the unpruned baseline.
print_sparse(model)
-----------------------------------
Layer sparse
conv1.weight 1.0
conv1.bias 1.0
conv2.weight 1.0
conv2.bias 1.0
fc1.weight 1.0
fc1.bias 1.0
fc2.weight 1.0
fc2.bias 1.0
Total: 1.0
可以發現，原有網絡完全沒有稀疏性。現在進行剪枝：使用閾值 0.01，絕對值小於 0.01 的連接將被剪掉。根據結果可以發現，在閾值 0.01 下，剪枝後僅剩約 8.4% 的參數，且準確率不受影響。
# puring() edits `model` in place and returns it, so `model1` refers to the
# same object as `model`.
model1 = puring(model,0.01)
test_model(model1,testloader)
print_sparse(model1)
0.9706289556962026
-----------------------------------
Layer sparse
conv1.weight 0.9739583333333334
conv1.bias 0.90625
conv2.weight 0.7641262478298612
conv2.bias 0.71875
fc1.weight 0.06729390669842156
fc1.bias 0.025390625
fc2.weight 0.7837890625
fc2.bias 0.9
Total: 0.08358673475128647
0.08358673475128647
現在調整閾值為 0.1，準確率大幅度下降，此時僅剩很少的參數。
# Restore the baseline weights first, because the previous pruning modified
# `model` in place; then prune with the larger threshold.
model.load_state_dict(pt.load("./base.ptb"))
model2 = puring(model,0.1)
test_model(model2,testloader)
print_sparse(model2)
0.09760680379746836
-----------------------------------
Layer sparse
conv1.weight 0.671875
conv1.bias 0.6875
conv2.weight 0.0
conv2.bias 0.0
fc1.weight 0.0
fc1.bias 0.0
fc2.weight 0.0
fc2.bias 0.0
Total: 6.553616029871108e-05
6.553616029871108e-05
現在進行閾值的格點掃描，掃描的範圍從 0.01 到 0.1，步長為 0.01。
# Sweep the pruning threshold over ten values starting at 0.01 in steps of
# 0.01, recording the test accuracy and overall non-zero ratio for each.
sparse_list = []
threshold_list = [x*0.01+0.01 for x in range(10)]
acc_list = []
# Read the checkpoint once; load_state_dict copies the values into the model,
# so pruning the model afterwards does not alter this dictionary.
base_state = pt.load("./base.ptb")
for i in threshold_list:
    # Start every run from the unpruned baseline (puring works in place).
    model.load_state_dict(base_state)
    model3 = puring(model, i)
    acc_list.append(test_model(model3, testloader))
    sparse_list.append(print_sparse(model3))
0.9706289556962026
-----------------------------------
Layer sparse
conv1.weight 0.9739583333333334
conv1.bias 0.90625
conv2.weight 0.7641262478298612
conv2.bias 0.71875
fc1.weight 0.06729390669842156
fc1.bias 0.025390625
fc2.weight 0.7837890625
fc2.bias 0.9
Total: 0.08358673475128647
0.47735363924050633
-----------------------------------
Layer sparse
conv1.weight 0.9375
conv1.bias 0.890625
conv2.weight 0.5333726671006944
conv2.bias 0.4765625
fc1.weight 0.0015011222995057398
fc1.bias 0.0
fc2.weight 0.5765625
fc2.bias 0.7
Total: 0.01398429139292775
0.09513449367088607
-----------------------------------
Layer sparse
conv1.weight 0.9045138888888888
conv1.bias 0.890625
conv2.weight 0.3156263563368056
conv2.bias 0.2578125
fc1.weight 1.5414490991709182e-05
fc1.bias 0.0
fc2.weight 0.371875
fc2.bias 0.4
Total: 0.007479941525322959
0.09612341772151899
-----------------------------------
Layer sparse
conv1.weight 0.8732638888888888
conv1.bias 0.875
conv2.weight 0.13545735677083334
conv2.bias 0.0546875
fc1.weight 0.0
fc1.bias 0.0
fc2.weight 0.1615234375
fc2.bias 0.1
Total: 0.003250198205069488
0.09691455696202532
-----------------------------------
Layer sparse
conv1.weight 0.8402777777777778
conv1.bias 0.84375
conv2.weight 0.03839111328125
conv2.bias 0.00390625
fc1.weight 0.0
fc1.bias 0.0
fc2.weight 0.016796875
fc2.bias 0.0
Total: 0.0009558243703890901
0.1003757911392405
-----------------------------------
Layer sparse
conv1.weight 0.8142361111111112
conv1.bias 0.796875
conv2.weight 0.0084228515625
conv2.bias 0.0
fc1.weight 0.0
fc1.bias 0.0
fc2.weight 0.0
fc2.bias 0.0
Total: 0.00026792277133719006
0.09760680379746836
-----------------------------------
Layer sparse
conv1.weight 0.7760416666666666
conv1.bias 0.765625
conv2.weight 0.0014580620659722222
conv2.bias 0.0
fc1.weight 0.0
fc1.bias 0.0
fc2.weight 0.0
fc2.bias 0.0
Total: 0.00010811185608441666
0.09760680379746836
-----------------------------------
Layer sparse
conv1.weight 0.7447916666666666
conv1.bias 0.734375
conv2.weight 0.00014241536458333334
conv2.bias 0.0
fc1.weight 0.0
fc1.bias 0.0
fc2.weight 0.0
fc2.bias 0.0
Total: 7.55718600196274e-05
0.09968354430379747
-----------------------------------
Layer sparse
conv1.weight 0.7065972222222222
conv1.bias 0.71875
conv2.weight 0.0
conv2.bias 0.0
fc1.weight 0.0
fc1.bias 0.0
fc2.weight 0.0
fc2.bias 0.0
Total: 6.888139353901653e-05
0.09760680379746836
-----------------------------------
Layer sparse
conv1.weight 0.671875
conv1.bias 0.6875
conv2.weight 0.0
conv2.bias 0.0
fc1.weight 0.0
fc1.bias 0.0
fc2.weight 0.0
fc2.bias 0.0
Total: 6.553616029871108e-05
import matplotlib.pyplot as plt
# Three panels, left to right: threshold vs. accuracy, threshold vs. non-zero
# ratio, and non-zero ratio vs. accuracy.
plt.figure(figsize=(10,3))
plt.subplot(131)
plt.plot(threshold_list, acc_list)
plt.subplot(132)
# The middle panel shows sparsity, not accuracy, against the threshold.
plt.plot(threshold_list, sparse_list)
plt.subplot(133)
plt.plot(sparse_list, acc_list)
plt.show()
上圖自左向右分別是閾值-準確率、閾值-稀疏度和稀疏度-準確率關係。
剪枝後微調
我們發現，閾值約為 0.02 時，準確率僅為 47% 左右，考慮使用微調權值的方式進行調整。
# Fresh network with the baseline weights, pruned in place at threshold 0.02
# (`model1` is the same object as `model`), then evaluated and summarised.
model = net().cuda()
model.load_state_dict(pt.load("./base.ptb"))
model1 = puring(model,0.02)
test_model(model1,testloader)
print_sparse(model1)
0.4759691455696203
-----------------------------------
Layer sparse
conv1.weight 0.9375
conv1.bias 0.890625
conv2.weight 0.5333726671006944
conv2.bias 0.4765625
fc1.weight 0.0015011222995057398
fc1.bias 0.0
fc2.weight 0.5765625
fc2.bias 0.7
Total: 0.01398429139292775
# Fine-tune the pruned network for four epochs. Gradients of zero-valued
# entries are masked before every optimizer step.
optimizer = pt.optim.Adam(model1.parameters(), 1e-5)
lossfunc = pt.nn.CrossEntropyLoss().cuda()
for _ in range(4):
    for i, (data, label) in enumerate(trainloader):
        data, label = data.cuda(), label.cuda()
        # Clear gradients from the previous step; without this they would
        # accumulate across all iterations.
        optimizer.zero_grad()
        outputs = model1(data)
        loss = lossfunc(outputs, label)
        loss.backward()
        grad_puring(model1)
        optimizer.step()
        if i % 100 == 0:
            print(i, acc(outputs, label))
0 0.4375
100 0.4375
200 0.5625
300 0.6015625
400 0.6875
0 0.7265625
100 0.6953125
200 0.7890625
300 0.8046875
400 0.7734375
0 0.8125
100 0.8046875
200 0.890625
300 0.8515625
400 0.875
0 0.859375
100 0.8515625
200 0.9140625
300 0.890625
400 0.9296875
# Evaluate and summarise the fine-tuned pruned network, then save its weights
# for the quantization stage.
test_model(model1,testloader)
print_sparse(model1)
pt.save(model1.state_dict(),'./puring.pt')
0.9367088607594937
-----------------------------------
Layer sparse
conv1.weight 0.9375
conv1.bias 0.890625
conv2.weight 0.5333726671006944
conv2.bias 0.4765625
fc1.weight 0.0015011222995057398
fc1.bias 0.0
fc2.weight 0.5765625
fc2.bias 0.7
Total: 0.01398429139292775
由上發現，經過權值微調後，在保持原有稀疏度的情況下將準確率提高到了 90% 以上。
量化實驗
量化過程比較複雜，分為量化和微調兩個步驟：量化步驟使用 sklearn 的 k-means 實現，微調使用 PyTorch 本身實現。
量化
# Fresh network loaded with the pruned, fine-tuned weights; evaluated once
# before quantization.
model = net().cuda()
model.load_state_dict(pt.load("./puring.pt"))
test_model(model,testloader)
0.9367088607594937
from sklearn.cluster import KMeans
import numpy as np
# Fit one k-means codebook with 2**bit centres per parameter tensor, using
# only its non-zero entries. kmean_list holds one entry per parameter, in
# named_parameters() order; None marks a tensor with too few non-zero values.
kmean_list = []
bit = 2
for name, i in model.named_parameters():
    data = i.data.cpu().numpy().reshape(-1)
    data = data[data != 0]
    if data.size < 2 ** bit:
        kmean_list.append(None)
        continue
    # Linear initialisation: centres evenly spaced over [min, max]. The span
    # is max - min; using max + min would pile the centres up near min.
    init = np.linspace(np.min(data), np.max(data), 2 ** bit)
    init = init.astype(data.dtype).reshape(2 ** bit, 1)
    # n_init=1 because explicit initial centres allow only one initialisation.
    kmn = KMeans(2 ** bit, init=init, n_init=1)
    kmn.fit(data.reshape((-1, 1)))
    kmean_list.append(kmn)
    print(name, i.shape)
conv1.weight torch.Size([64, 1, 3, 3])
conv1.bias torch.Size([64])
conv2.weight torch.Size([256, 64, 3, 3])
conv2.bias torch.Size([256])
fc1.weight torch.Size([512, 12544])
fc2.weight torch.Size([10, 512])
fc2.bias torch.Size([10])
c:\program files\python35\lib\site-packages\sklearn\cluster\k_means_.py:896: RuntimeWarning: Explicit initial center position passed: performing only one init in k-means instead of n_init=10
return_n_iter=True)
訓練完量化器後，將每一層數據使用對應的量化器進行量化。
# Replace every non-zero entry of each parameter by its nearest k-means
# centre. Two attributes are attached to the parameter for later codebook
# fine-tuning:
#   kmeans_result - tensor shaped like the parameter holding the cluster
#                   index of each entry, or -1 for pruned (zero) entries
#   kmeans_label  - the cluster centres
# Both are None for parameters without a fitted codebook.
for i, (name, f) in enumerate(model.named_parameters()):
    data = f.data.clone().view(-1).cpu().detach().numpy().reshape(-1)
    nonzero_mask = data != 0
    data_nozero = data[nonzero_mask].reshape((-1, 1))
    if data_nozero.size == 0 or data.size < 2 ** bit or kmean_list[i] is None:
        f.kmeans_result = None
        f.kmeans_label = None
        continue
    centers = kmean_list[i].cluster_centers_
    label = kmean_list[i].predict(data_nozero).reshape(-1)
    # Build the index map from the mask rather than from weight values, so a
    # genuine weight equal to -1.0 cannot be mistaken for the pruned marker.
    result = np.full(data.shape, -1, dtype=data.dtype)
    result[nonzero_mask] = label
    # Vectorised lookup of each entry's centre.
    data[nonzero_mask] = centers[label].reshape(-1)
    f.data = pt.from_numpy(data).view(f.data.shape).cuda()
    f.kmeans_result = pt.from_numpy(result).view(f.data.shape).cuda()
    f.kmeans_label = pt.from_numpy(centers).cuda()
# Evaluate the quantized network and report its non-zero ratios.
test_model(model,testloader)
print_sparse(model)
0.8919106012658228
-----------------------------------
Layer sparse
conv1.weight 0.9375
conv1.bias 0.890625
conv2.weight 0.5333726671006944
conv2.bias 0.4765625
fc1.weight 0.0015011222995057398
fc1.bias 0.0
fc2.weight 0.5765625
fc2.bias 0.7
Total: 0.01398429139292775
0.01398429139292775
由上可以發現，對於這種玩具級的網絡來說，2bit 量化已經完全足夠了，精度損失約 4.5 個百分點（0.937 → 0.892）。
微調
# Fine-tune the cluster centres for one epoch. For each quantized parameter
# and each cluster, the gradients of all entries assigned to that cluster are
# summed, the centre is moved by -lr * sum, and the new centre value is
# written back to those entries. Parameters without a codebook are skipped.
lossfunc = pt.nn.CrossEntropyLoss().cuda()
lr = 0.001
for _ in range(1):
    for a, (data, label) in enumerate(trainloader):
        data = data.cuda()
        label = label.cuda()
        model.zero_grad()
        outputs = model(data)
        loss = lossfunc(outputs, label)
        loss.backward()
        for name, i in model.named_parameters():
            if i.kmeans_result is None:
                continue
            for x in range(2 ** bit):
                # Entries currently assigned to cluster x.
                members = i.kmeans_result == x
                grad = pt.sum(i.grad.detach()[members])
                i.kmeans_label[x] += -lr * grad.item()
                i.data[members] = i.kmeans_label[x].item()
        if a % 100 == 0:
            print(a, acc(outputs, label))
0 0.8828125
100 0.921875
200 0.9296875
300 0.9296875
400 0.9140625
# Evaluate and summarise the network after codebook fine-tuning, then save
# its state dict.
# NOTE(review): the kmeans_result / kmeans_label attributes attached to the
# parameters are presumably not part of state_dict() - confirm if the
# codebook itself needs to be persisted.
test_model(model,testloader)
print_sparse(model)
pt.save(model.state_dict(),"quantization.pt")
0.9384889240506329
-----------------------------------
Layer sparse
conv1.weight 0.9375
conv1.bias 0.890625
conv2.weight 0.5333726671006944
conv2.bias 0.4765625
fc1.weight 0.0015011222995057398
fc1.bias 0.0
fc2.weight 0.5765625
fc2.bias 0.7
Total: 0.01398429139292775
通過對量化中心的微調，2bit 量化網絡的準確率已經與非量化網絡的準確率相當。