PyTorch Learning Notes: Comparing the torchtext and PyTorch Examples
0. Introduction to the PyTorch Seq2Seq Project
After working through the basics of torchtext, I found this tutorial: "Understanding and Implementing Seq2Seq Models with PyTorch and torchtext".
The project consists of six sub-projects:
1. Sequence-to-sequence learning with neural networks
2. Learning phrase representations with an RNN encoder-decoder for statistical machine translation
3. Neural machine translation by jointly learning to align and translate
4. Packed padded sequences, masking and inference
5. Convolutional Seq2Seq
6. Transformer
After finishing the Transformer I took a two-day break. Over these two days I want to compare and summarize the models. I already wrote a summary after completing the first three; today the focus is on how the six models evolve and how they are implemented. Implementation is the key part: the whole thing took 15 days, and so far the only implementation I can fully follow is the vanilla Seq2Seq...
7. Summary: From Vanilla Seq2Seq to the Transformer
All six models are Seq2Seq: each consists of an Encoder and a Decoder. They differ in the core architecture, with each one adding something new between layers or between the Encoder and Decoder: LSTM -> multi-layer GRU -> Attention -> Attention with packing/padding/masking -> CNN -> Transformer
- 1 and 2 are vanilla Seq2Seq, using an LSTM and its variant, the GRU
- 3 and 4 build on attention, adding packing, padding and masking
- 5 uses a CNN
- 6 is all-attention; every fancy component gets thrown in
7.5 Comparing Models 5 and 6
<center class="half">
<img src="https://upload-images.jianshu.io/upload_images/14340919-3cd19da0da351933.png?imageMogr2/auto-orient/strip%7CimageView2/2/w/1240" width="600"/><img src="https://upload-images.jianshu.io/upload_images/14340919-9e7d518eea914b5c.png?imageMogr2/auto-orient/strip%7CimageView2/2/w/1240" width="600"/>
</center>
This is the last model. Comparing the two, the biggest difference is that the Transformer is an all-attention architecture: it drops RNN and CNN entirely. As Zhang Junlin's analysis already argued, it beats CNN and RNN on semantic feature extraction, long-range dependency capture, combined task feature extraction, parallel computation, and runtime efficiency. For long text, Transformer-XL is the variant that tackles very long inputs; there is a PyTorch-based project for it worth a look.
The overall structures still share some similarities. The ConvSeq2Seq Encoder and Decoder use stacked convolutions to extract features, with GLU as the activation; the attention in between is dot-product attention, followed by residual connections that add the attention-weighted input back into the decoder output to produce the output sequence. The Transformer instead replaces the CNN with multi-head self-attention in both the Encoder and the Decoder, aligns the input sequences with a padding mask plus a sequence (subsequent) mask, wraps every sub-layer with layer normalization and a residual connection, and links the Encoder and Decoder with multi-head context-attention.
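As a side note, here is a minimal sketch of the GLU gating used in the ConvSeq2Seq blocks; it is not taken from the tutorial, and the sizes (hid_dim=64, kernel_size=3, a batch of 8 sentences of length 20) are made up for illustration:

import torch
import torch.nn as nn
import torch.nn.functional as F

# The convolution produces 2 * hid_dim channels so that F.glu can split them
# into a content half and a gate half: glu -> a * sigmoid(b)
hid_dim, kernel_size = 64, 3
conv = nn.Conv1d(hid_dim, 2 * hid_dim, kernel_size, padding=(kernel_size - 1) // 2)

x = torch.randn(8, hid_dim, 20)      # [batch size, hid dim, sent len]
gated = F.glu(conv(x), dim=1)        # gate along the channel dimension
print(gated.shape)                   # torch.Size([8, 64, 20]) -- back to hid_dim channels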
As usual, let's implement it.
import torch
import torch.nn as nn
import torch.nn.functional as F


class Encoder(nn.Module):
    def __init__(self, input_dim, hid_dim, n_layers, n_heads, pf_dim, encoder_layer,
                 self_attention, positionwise_feedforward, dropout, device):
        super(Encoder, self).__init__()
        self.input_dim = input_dim
        self.hid_dim = hid_dim
        self.n_layers = n_layers
        self.n_heads = n_heads
        self.pf_dim = pf_dim
        self.encoder_layer = encoder_layer
        self.self_attention = self_attention
        self.positionwise_feedforward = positionwise_feedforward
        self.dropout = dropout
        self.device = device
        self.tok_embedding = nn.Embedding(input_dim, hid_dim)
        # Position embedding: nn.Embedding(1000, hid_dim) is a 1000 x hid_dim lookup table,
        # i.e. it supports up to 1000 positions, each mapped to a hid_dim-dimensional vector.
        self.pos_embedding = nn.Embedding(1000, hid_dim)
        self.layers = nn.ModuleList([encoder_layer(hid_dim, n_heads, pf_dim, self_attention,
                                                   positionwise_feedforward, dropout, device)
                                     for _ in range(n_layers)])
        self.do = nn.Dropout(dropout)
        self.scale = torch.sqrt(torch.FloatTensor([hid_dim])).to(device)

    # Besides src, forward also takes src_mask, the padding mask
    def forward(self, src, src_mask):
        pos = torch.arange(0, src.shape[1]).unsqueeze(0).repeat(src.shape[0], 1).to(self.device)
        # scale the token embedding, add the position embedding, then apply dropout
        src = self.do((self.tok_embedding(src) * self.scale) + self.pos_embedding(pos))
        for layer in self.layers:
            src = layer(src, src_mask)
        return src
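# A quick sanity check (not part of the tutorial) of how the position indices in
# Encoder.forward are built; the batch size 2 and source length 5 are made-up values:
pos_demo = torch.arange(0, 5).unsqueeze(0).repeat(2, 1)
print(pos_demo)        # tensor([[0, 1, 2, 3, 4],
                       #         [0, 1, 2, 3, 4]])
print(pos_demo.shape)  # torch.Size([2, 5]) -> [batch size, src sent len]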
# Encoder layer: self-attention and position-wise feedforward, each wrapped
# with a residual connection and layer normalization
class EncoderLayer(nn.Module):
    def __init__(self, hid_dim, n_heads, pf_dim, self_attention, positionwise_feedforward, dropout, device):
        super(EncoderLayer, self).__init__()
        self.ln = nn.LayerNorm(hid_dim)
        self.sa = self_attention(hid_dim, n_heads, dropout, device)
        self.pf = positionwise_feedforward(hid_dim, pf_dim, dropout)
        self.do = nn.Dropout(dropout)

    def forward(self, src, src_mask):
        src = self.ln(src + self.do(self.sa(src, src, src, src_mask)))
        src = self.ln(src + self.do(self.pf(src)))
        return src
# Multi-head self-attention
class SelfAttention(nn.Module):
    def __init__(self, hid_dim, n_heads, dropout, device):
        super(SelfAttention, self).__init__()
        self.hid_dim = hid_dim
        self.n_heads = n_heads
        assert hid_dim % n_heads == 0
        self.w_q = nn.Linear(hid_dim, hid_dim)
        self.w_k = nn.Linear(hid_dim, hid_dim)
        self.w_v = nn.Linear(hid_dim, hid_dim)
        self.fc = nn.Linear(hid_dim, hid_dim)
        self.do = nn.Dropout(dropout)
        self.scale = torch.sqrt(torch.FloatTensor([hid_dim // n_heads])).to(device)

    def forward(self, query, key, value, mask=None):
        bsz = query.shape[0]
        # query = key = value = [batch size, sent len, hid dim]
        Q = self.w_q(query)
        K = self.w_k(key)
        V = self.w_v(value)
        # Q, K, V = [batch size, sent len, hid dim]
        Q = Q.view(bsz, -1, self.n_heads, self.hid_dim // self.n_heads).permute(0, 2, 1, 3)
        K = K.view(bsz, -1, self.n_heads, self.hid_dim // self.n_heads).permute(0, 2, 1, 3)
        V = V.view(bsz, -1, self.n_heads, self.hid_dim // self.n_heads).permute(0, 2, 1, 3)
        # Q, K, V = [batch size, n heads, sent len, hid dim // n heads]
        # scaled dot-product scores: Q @ K^T / sqrt(head dim)
        energy = torch.matmul(Q, K.permute(0, 1, 3, 2)) / self.scale
        # energy = [batch size, n heads, sent len, sent len]
        if mask is not None:
            energy = energy.masked_fill(mask == 0, -1e10)
        # softmax over the key positions, followed by dropout
        attention = self.do(F.softmax(energy, dim=-1))
        # attention = [batch size, n heads, sent len, sent len]
        x = torch.matmul(attention, V)
        # x = [batch size, n heads, sent len, hid dim // n heads]
        x = x.permute(0, 2, 1, 3).contiguous()
        # x = [batch size, sent len, n heads, hid dim // n heads]
        x = x.view(bsz, -1, self.n_heads * (self.hid_dim // self.n_heads))
        # x = [batch size, sent len, hid dim]
        x = self.fc(x)
        return x
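# A quick shape check for SelfAttention (not from the tutorial); hid_dim=512,
# n_heads=8, a batch of 2 sentences of length 10 and dropout 0.1 are assumed values:
sa = SelfAttention(hid_dim=512, n_heads=8, dropout=0.1, device='cpu')
q = torch.randn(2, 10, 512)    # [batch size, sent len, hid dim]
print(sa(q, q, q).shape)       # torch.Size([2, 10, 512]) -- same shape in and out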
class PositionwiseFeedforward(nn.Module):
    def __init__(self, hid_dim, pf_dim, dropout):
        super(PositionwiseFeedforward, self).__init__()
        self.hid_dim = hid_dim
        self.pf_dim = pf_dim
        self.fc_1 = nn.Conv1d(hid_dim, pf_dim, 1)
        self.fc_2 = nn.Conv1d(pf_dim, hid_dim, 1)
        self.do = nn.Dropout(dropout)

    def forward(self, x):
        # x = [batch size, sent len, hid dim]
        x = x.permute(0, 2, 1)
        # x = [batch size, hid dim, sent len]
        x = self.do(F.relu(self.fc_1(x)))
        # x = [batch size, pf dim, sent len]
        x = self.fc_2(x)
        # x = [batch size, hid dim, sent len]
        x = x.permute(0, 2, 1)
        # x = [batch size, sent len, hid dim]
        return x
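# Design note: a kernel-size-1 Conv1d over [batch, channel, len] applies the same
# per-position transform as an nn.Linear over [batch, len, channel]. A small check
# of that equivalence (toy sizes, weights copied across by hand, not from the tutorial):
lin = nn.Linear(16, 32)
conv1 = nn.Conv1d(16, 32, 1)
with torch.no_grad():
    conv1.weight.copy_(lin.weight.unsqueeze(-1))   # [32, 16] -> [32, 16, 1]
    conv1.bias.copy_(lin.bias)
x_demo = torch.randn(4, 7, 16)                     # [batch size, sent len, hid dim]
print(torch.allclose(lin(x_demo),
                     conv1(x_demo.permute(0, 2, 1)).permute(0, 2, 1), atol=1e-6))  # True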
class Decoder(nn.Module):
    def __init__(self, output_dim, hid_dim, n_layers, n_heads, pf_dim, decoder_layer,
                 self_attention, positionwise_feedforward, dropout, device):
        super(Decoder, self).__init__()
        self.output_dim = output_dim
        self.hid_dim = hid_dim
        self.n_layers = n_layers
        self.n_heads = n_heads
        self.pf_dim = pf_dim
        self.decoder_layer = decoder_layer
        self.self_attention = self_attention
        self.positionwise_feedforward = positionwise_feedforward
        self.dropout = dropout
        self.device = device
        self.tok_embedding = nn.Embedding(output_dim, hid_dim)
        self.pos_embedding = nn.Embedding(1000, hid_dim)
        self.layers = nn.ModuleList([decoder_layer(hid_dim, n_heads, pf_dim, self_attention,
                                                   positionwise_feedforward, dropout, device)
                                     for _ in range(n_layers)])
        self.fc = nn.Linear(hid_dim, output_dim)
        self.do = nn.Dropout(dropout)
        self.scale = torch.sqrt(torch.FloatTensor([hid_dim])).to(device)

    def forward(self, trg, src, trg_mask, src_mask):
        # trg = [batch size, trg sent len]
        # src = [batch size, src sent len, hid dim] (encoder output)
        # trg_mask = [batch size, trg sent len]
        # src_mask = [batch size, src sent len]
        pos = torch.arange(0, trg.shape[1]).unsqueeze(0).repeat(trg.shape[0], 1).to(self.device)
        trg = self.do((self.tok_embedding(trg) * self.scale) + self.pos_embedding(pos))
        for layer in self.layers:
            trg = layer(trg, src, trg_mask, src_mask)
        return self.fc(trg)
# Decoder layer: note that trg and src use different masks; how the two masks
# are built is shown in the Seq2Seq class below
class DecoderLayer(nn.Module):
    def __init__(self, hid_dim, n_heads, pf_dim, self_attention, positionwise_feedforward, dropout, device):
        super().__init__()
        self.ln = nn.LayerNorm(hid_dim)
        self.sa = self_attention(hid_dim, n_heads, dropout, device)   # masked self-attention
        self.ea = self_attention(hid_dim, n_heads, dropout, device)   # encoder-decoder attention
        self.pf = positionwise_feedforward(hid_dim, pf_dim, dropout)
        self.do = nn.Dropout(dropout)

    def forward(self, trg, src, trg_mask, src_mask):
        # trg = [batch size, trg sent len, hid dim]
        # src = [batch size, src sent len, hid dim]
        # trg_mask = [batch size, trg sent len]
        # src_mask = [batch size, src sent len]
        trg = self.ln(trg + self.do(self.sa(trg, trg, trg, trg_mask)))
        trg = self.ln(trg + self.do(self.ea(trg, src, src, src_mask)))
        trg = self.ln(trg + self.do(self.pf(trg)))
        return trg
class Seq2Seq(nn.Module):
    def __init__(self, encoder, decoder, pad_idx, device):
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.pad_idx = pad_idx
        self.device = device

    # Masking: a padding mask for the source; a padding mask combined with a
    # lower-triangular (subsequent) mask for the target
    def make_masks(self, src, trg):
        # src = [batch size, src sent len]
        # trg = [batch size, trg sent len]
        src_mask = (src != self.pad_idx).unsqueeze(1).unsqueeze(2)
        trg_pad_mask = (trg != self.pad_idx).unsqueeze(1).unsqueeze(3)
        trg_len = trg.shape[1]
        # bool dtype so it combines cleanly with the bool padding mask via &
        trg_sub_mask = torch.tril(torch.ones((trg_len, trg_len), device=self.device)).bool()
        trg_mask = trg_pad_mask & trg_sub_mask
        return src_mask, trg_mask

    def forward(self, src, trg):
        # src = [batch size, src sent len]
        # trg = [batch size, trg sent len]
        src_mask, trg_mask = self.make_masks(src, trg)
        enc_src = self.encoder(src, src_mask)
        # enc_src = [batch size, src sent len, hid dim]
        out = self.decoder(trg, enc_src, trg_mask, src_mask)
        # out = [batch size, trg sent len, output dim]
        return out
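To check that all the pieces fit together, here is a usage sketch. Every hyperparameter (the vocabulary sizes, pad index, hid_dim=512, 6 layers, 8 heads, pf_dim=2048, dropout=0.1) and the dummy batch below are assumptions for illustration, not values from the tutorial.

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

INPUT_DIM, OUTPUT_DIM, PAD_IDX = 7855, 5893, 1                 # made-up vocab sizes / pad index
HID_DIM, N_LAYERS, N_HEADS, PF_DIM, DROPOUT = 512, 6, 8, 2048, 0.1

enc = Encoder(INPUT_DIM, HID_DIM, N_LAYERS, N_HEADS, PF_DIM, EncoderLayer,
              SelfAttention, PositionwiseFeedforward, DROPOUT, device)
dec = Decoder(OUTPUT_DIM, HID_DIM, N_LAYERS, N_HEADS, PF_DIM, DecoderLayer,
              SelfAttention, PositionwiseFeedforward, DROPOUT, device)
model = Seq2Seq(enc, dec, PAD_IDX, device).to(device)

src = torch.randint(0, INPUT_DIM, (32, 25)).to(device)         # [batch size, src sent len]
trg = torch.randint(0, OUTPUT_DIM, (32, 23)).to(device)        # [batch size, trg sent len]
print(model(src, trg).shape)                                   # torch.Size([32, 23, 5893])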
That wraps up this part. It took 16 days to work through the six models, learn torchtext, move my environment from local to Colab, and get the Transformer running.
But all of this is still someone else's work. How do I digest it and make it my own?
The next goal is the 2017 paper "Deep Context Model for Grammatical Error Correction". This road is a lonely one... no group meetings, no team, just me on my own...