1. The first application of the attention mechanism in machine translation
2. The workflow for writing and training a model
import torch
import torch.nn as nn
import torch.nn.functional as F
import random
from torch.utils.data import TensorDataset, DataLoader
from sklearn.metrics import accuracy_score
import numpy as np
import time
learning_rate = 2e-5
weight_decay = 1e-2
epsilon = 1e-8
epoch_num = 5
gpu_id = 1
model_save_path = './'
# Custom model: inherit from nn.Module
class TModel(nn.Module):
    def __init__(self):
        super(TModel, self).__init__()
        # Custom network layers
        self.ff = nn.Linear(100, 2)
        self.relu = nn.ReLU()

    # Custom forward pass
    def forward(self, x):
        x = self.relu(self.ff(x))
        return x
def get_train_data():
    # Initialize random data (seed NumPy, since the features are generated with np.random)
    np.random.seed(1)
    x = np.random.randn(1000, 100)
    # y is a 1-D vector here, not a 500*1 matrix
    y = np.concatenate((np.ones(500), np.zeros(500)), axis=0)
    # Shuffle the data
    shuffle_data = [(x[i], y[i]) for i in range(len(x))]
    random.shuffle(shuffle_data)
    # Convert the data to tensors
    input_x = torch.Tensor([e[0] for e in shuffle_data])
    input_y = torch.Tensor([e[1] for e in shuffle_data]).long()  # CrossEntropyLoss requires labels of dtype long
    # Split into training and validation sets
    td_index = int(len(shuffle_data) * 0.9)
    # Wrap the training and validation sets
    train_dataset = TensorDataset(input_x[:td_index], input_y[:td_index])
    train_dataloader = DataLoader(train_dataset, batch_size=32, shuffle=True)
    dev_dataset = TensorDataset(input_x[td_index:], input_y[td_index:])
    dev_dataloader = DataLoader(dev_dataset, batch_size=32, shuffle=True)
    return train_dataloader, dev_dataloader
def main():
    # Instantiate the model
    model = TModel()
    model.cuda(gpu_id)
    # Define the loss function
    loss_fn = nn.CrossEntropyLoss()
    # Define the optimizer, i.e. choose the gradient-descent variant
    # Option 1: plain Adam
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
    # Option 2: AdamW with weight decay to curb overfitting, the usual setup for models from transformers
    # no_decay = ['bias', 'LayerNorm.weight']
    # optimizer_grouped_parameters = [
    #     {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
    #      'weight_decay': weight_decay},
    #     {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)],
    #      'weight_decay': 0.0}
    # ]
    # optimizer = torch.optim.AdamW(optimizer_grouped_parameters, lr=learning_rate, eps=epsilon)
    # Prepare the data
    train_dataloader, dev_dataloader = get_train_data()
    max_dev_acc = 0.9
    # Iterate over epochs
    for epoch in range(epoch_num):
        # Switch to training mode
        model.train()
        # Reset the prediction and ground-truth lists for this epoch
        train_pred, train_true = [], []
        for x_i, y_i in train_dataloader:
            # Move the tensors to the GPU
            x_i = x_i.cuda(gpu_id)
            y_i = y_i.cuda(gpu_id)
            output = model(x_i)
            loss = loss_fn(output, y_i)
            # The training phase computes gradients, backpropagates, and updates the parameters
            optimizer.zero_grad()
            loss.backward()
            # Gradient clipping to guard against exploding gradients
            nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            optimizer.step()
            train_pred += torch.max(output, dim=1)[1].cpu().numpy().tolist()
            train_true += y_i.cpu().numpy().tolist()
        print("Epoch: %d: train acc[%.3f]" % (epoch, accuracy_score(train_true, train_pred)))
        # Switch to evaluation mode
        model.eval()
        # Reset the prediction and ground-truth lists for this epoch
        with torch.no_grad():
            dev_pred, dev_true = [], []
            for x_i, y_i in dev_dataloader:
                # Move the tensors to the GPU
                x_i = x_i.cuda(gpu_id)
                y_i = y_i.cuda(gpu_id)
                output = model(x_i)
                dev_pred += torch.max(output, dim=1)[1].cpu().numpy().tolist()
                dev_true += y_i.cpu().numpy().tolist()
            dev_acc = accuracy_score(dev_true, dev_pred)
            print("Epoch: %d: dev acc[%.3f]" % (epoch, dev_acc))
            # Save a checkpoint whenever the dev accuracy improves on the best seen so far
            if dev_acc > max_dev_acc:
                torch.save(model.state_dict(), model_save_path + 'epoch_%d_dev_acc_%.3f' % (epoch, dev_acc))
                max_dev_acc = dev_acc
if __name__ == "__main__":
    time_start = time.time()
    main()
    time_end = time.time()
    print('total time cost: %.2f s' % (time_end - time_start))
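As a usage note (not part of the original script), the checkpoints saved by main() can be reloaded for inference with torch.load and load_state_dict. A minimal sketch, assuming a hypothetical checkpoint name in the format used by torch.save above:

# Minimal sketch: reload a saved state_dict for inference.
# 'epoch_4_dev_acc_0.978' is a hypothetical filename; use whichever checkpoint was actually saved.
model = TModel()
state_dict = torch.load(model_save_path + 'epoch_4_dev_acc_0.978', map_location='cpu')
model.load_state_dict(state_dict)
model.eval()
with torch.no_grad():
    logits = model(torch.randn(4, 100))   # a dummy batch of 4 samples with 100 features
    preds = torch.max(logits, dim=1)[1]   # predicted class indices
    print(preds)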
The collate_fn parameter of DataLoader
Adapted from https://www.cnblogs.com/zf-blog/p/11360557.html
By default, collate_fn requires every sample in a batch to have the same size, because the samples are stacked into a single tensor.
When the samples in a batch have different sizes, you can supply a custom collate_fn: the samples are then no longer stacked but simply kept in a list, along with their corresponding labels, as in the following example:
def my_collate(batch):
    data = [item[0] for item in batch]
    target = [item[1] for item in batch]
    target = torch.LongTensor(target)
    return [data, target]
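For reference, here is a minimal usage sketch (not from the original post) showing how my_collate plugs into a DataLoader; the toy variable-length dataset is hypothetical and reuses the torch / DataLoader imports from the script above:

# Hypothetical toy dataset: samples of different lengths, so the default collate could not stack them
variable_length_data = [(torch.randn(n), n % 2) for n in range(5, 15)]
loader = DataLoader(variable_length_data, batch_size=4, shuffle=True, collate_fn=my_collate)
for data, target in loader:
    # data is a plain Python list of tensors with different shapes; target is a LongTensor of labels
    print(len(data), target)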