Chatbot------用PyTorch做對話機器人
引用
[TOC]
導入頭文件
使用了from __future__ import xxx
可以在python2、python3環境下運行同一份代碼而不出錯,編寫的時候使用python3規范即可。
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals
進行數據處理
數據正規化
首先將句子中的單詞都轉換成小寫并去除兩邊的空白符,接著將句子的unicode
編碼轉換成Ascii
編碼,便于使用正則表達式對文本進行正規化。再接著為了便于分詞,我們將無效字符轉換成我們能處理的字符,s = re.sub(r"([.!?])", r" \1", s)
意思是將句號、感嘆號、疑問號前面都加上一個空格,\1
的意思是與之前()
里面匹配的東西原封不動復制一下,所以原來匹配到的是句號那么這次還是句號,原來匹配到的是感嘆號這次還是感嘆號,s = re.sub(r"[^a-zA-Z.!?]+", r" ", s)
,意思是將不是大小寫字母、句號、疑問號、感嘆號的非法字符全部用空格符替換,注意有個+
號,是因為要將所有連在一起的非法字符只替換一遍,所以加號一定不要忘了,s = re.sub(r"\s+", r" ", s).strip()
,意思是將所有的空白符替換成空格,如果多個空白符連接在一起那么只替換一次。
def unicodeToAscii(s):
    """Return *s* with combining marks removed.

    The string is decomposed with Unicode NFD normalization and every
    character whose category is ``Mn`` (nonspacing mark) is dropped, so an
    accented letter is reduced to its base letter.
    """
    decomposed = unicodedata.normalize('NFD', s)
    base_chars = [ch for ch in decomposed if unicodedata.category(ch) != 'Mn']
    return ''.join(base_chars)
# Lowercase, trim, and remove non-letter characters
def normalizeString(s):
    """Normalize a raw sentence into space-separated lowercase tokens.

    The sentence is lowercased, stripped and passed through
    ``unicodeToAscii``; then three regex substitutions are applied in order:
    a space is inserted before each ``.``, ``!`` or ``?``; every run of
    characters other than letters and ``.!?`` becomes one space; every run of
    whitespace becomes one space. The result is stripped once more.
    """
    text = unicodeToAscii(s.lower().strip())
    substitutions = (
        (r"([.!?])", r" \1"),
        (r"[^a-zA-Z.!?]+", r" "),
        (r"\s+", r" "),
    )
    for pattern, replacement in substitutions:
        text = re.sub(pattern, replacement, text)
    return text.strip()
讀取數據
lines = open(datafile, encoding='utf-8').read().strip().split('\n')
,datafile
是一個每行包含一對query/response的文件,
中間用\t
分隔。
# Read query/response pairs and return a voc object
def readVocs(datafile, corpus_name):
    """Read *datafile* and return a fresh ``Voc`` plus the normalized pairs.

    Args:
        datafile: Path to a UTF-8 text file; each line is split on tab
            characters and every field is passed through ``normalizeString``.
        corpus_name: Passed to the ``Voc`` constructor.

    Returns:
        ``(voc, pairs)`` where ``pairs`` is a list with one list of
        normalized fields per line of the file.
    """
    print("Reading lines...")
    # Use a context manager so the file handle is closed deterministically
    # (the bare open().read() chain left it to the garbage collector).
    with open(datafile, encoding='utf-8') as corpus_file:
        lines = corpus_file.read().strip().split('\n')
    # Split every line into pairs and normalize
    pairs = [[normalizeString(field) for field in line.split('\t')] for line in lines]
    voc = Voc(corpus_name)
    return voc, pairs
代碼寫的很有靈性,判斷pairs
里面的句子長度是不是超過了最大長度,如果超過了最大長度那么就丟掉。
def filterPair(p):
    """Return True when both sentences of pair *p* are shorter than MAX_LENGTH.

    Length is the number of pieces obtained by splitting on a single space.
    The comparison is strict, which leaves room for the EOS token that is
    appended to each sequence later.
    """
    if len(p[0].split(' ')) >= MAX_LENGTH:
        return False
    return len(p[1].split(' ')) < MAX_LENGTH
# Filter pairs using filterPair condition
def filterPairs(pairs):
    """Return a new list holding only the pairs accepted by ``filterPair``."""
    return list(filter(filterPair, pairs))
讀取并整合數據,voc就是一個大的單詞對應表,里面有word2index
,有word2count
,有index2word。
def loadPrepareData(corpus, corpus_name, datafile, save_dir):
    """Read the corpus, drop over-long pairs and populate the vocabulary.

    ``corpus`` and ``save_dir`` are accepted but not used by this function.

    Returns:
        ``(voc, pairs)``: the vocabulary after every kept sentence has been
        added to it, and the list of kept pairs.
    """
    print("Start preparing training data ...")
    voc, all_pairs = readVocs(datafile, corpus_name)
    print("Read {!s} sentence pairs".format(len(all_pairs)))
    kept_pairs = filterPairs(all_pairs)
    print("Trimmed to {!s} sentence pairs".format(len(kept_pairs)))
    print("Counting words...")
    for kept in kept_pairs:
        # Register the query first, then the response.
        for sentence in (kept[0], kept[1]):
            voc.addSentence(sentence)
    print("Counted words:", voc.num_words)
    return voc, kept_pairs
# Build the vocabulary and the sentence pairs from the corpus file
save_dir = os.path.join("data", "save")
voc, pairs = loadPrepareData(corpus, corpus_name, datafile, save_dir)
# Show the first ten pairs as a quick sanity check
print("\npairs:")
preview = pairs[:10]
for pair in preview:
    print(pair)
根據單詞表對句子進行裁剪,首先先把單詞表中詞頻小于MIN_COUNT的單詞全部丟棄,然后對于句子判斷如果句子中有詞頻過小的單詞,那么整個句子也不用保留。
MIN_COUNT = 3    # Minimum word count threshold for trimming


def trimRareWords(voc, pairs, MIN_COUNT):
    """Trim rare words from *voc* and drop every pair that uses one.

    Args:
        voc: Vocabulary object; ``voc.trim(MIN_COUNT)`` is called first and
            ``voc.word2index`` is then used for membership tests.
        pairs: List of ``[input_sentence, output_sentence]`` items.
        MIN_COUNT: Threshold forwarded to ``voc.trim``.

    Returns:
        A new list with the pairs whose two sentences consist only of words
        still present in ``voc.word2index``.
    """
    # Trim words used under the MIN_COUNT from the voc
    voc.trim(MIN_COUNT)
    # Read the mapping only after trimming, in case trim() rebuilds it.
    known_words = voc.word2index

    def _all_known(sentence):
        """Return True when every space-separated word is in the vocabulary."""
        return all(word in known_words for word in sentence.split(' '))

    # Only keep pairs that do not contain trimmed word(s) in their input or output sentence
    keep_pairs = [pair for pair in pairs if _all_known(pair[0]) and _all_known(pair[1])]

    # Guard the ratio: an empty `pairs` list used to raise ZeroDivisionError here.
    kept_ratio = len(keep_pairs) / len(pairs) if pairs else 0.0
    print("Trimmed from {} pairs to {}, {:.4f} of total".format(len(pairs), len(keep_pairs), kept_ratio))
    return keep_pairs
# Drop rare words from voc and discard the pairs that contained them
pairs = trimRareWords(
    voc,
    pairs,
    MIN_COUNT,
)
將句子轉(zhuǎn)換成Tensor
預(yù)處理的最后一步悯辙,在訓(xùn)練翻譯模型的時(shí)候我們都是一個(gè)句子一個(gè)句子進(jìn)行訓(xùn)練的琳省,但是為了加速訓(xùn)練,最好還是使用Mini-batch進(jìn)行訓(xùn)練笑撞,如果想要使用Mini-batch 進(jìn)行訓(xùn)練岛啸,那么就要讓一個(gè)batch里面的句子長(zhǎng)度一樣長(zhǎng),如果一個(gè)句子太短了茴肥,那么就在EOS_token
后面進(jìn)行0
填充,,這樣構(gòu)建出來的Mini_batch
的形狀是batch_size*max_length
,為了編寫代碼方便荡灾,我們需要batch[0]
指示所有句子第一個(gè)單詞(總共有batch_size
個(gè))瓤狐,所以我們還需要把這個(gè)Mini_batch`的矩陣轉(zhuǎn)置一下。
- 將所有的單詞轉(zhuǎn)換成
index
批幌,然后加上表示結(jié)束的index
---EOS_token
础锐。
def indexesFromSentence(voc, sentence):
    """Map each space-separated word of *sentence* to its index and append EOS.

    Raises:
        KeyError: if a word is missing from ``voc.word2index``.
    """
    indexes = [voc.word2index[token] for token in sentence.split(' ')]
    indexes.append(EOS_token)
    return indexes
2.itertools.zip_longest
函數(shù)第一個(gè)參數(shù)是聚合在一起的元素(l是一個(gè)二維矩陣,*l(解壓操作)是去除最外面的中括號(hào)荧缘,所以就變成了一堆聚合在一起的元素了)皆警,函數(shù)第二個(gè)參數(shù)是要填充的字符值,函數(shù)功能是對(duì)長(zhǎng)度不一樣的聚合在一起的元素截粗,使用fillvalue
進(jìn)行填充,填充成最大元素的長(zhǎng)度意推。
itertools.zip_longest:Make an iterator that aggregates elements from each of the iterables. If the iterables are of uneven length, missing values are filled-in with fillvalue. Iteration continues until the longest iterable is exhausted.
def zeroPadding(l, fillvalue=PAD_token):
    """Transpose a batch of index lists, padding short ones with *fillvalue*.

    ``itertools.zip_longest`` aggregates the i-th element of every sequence,
    so element ``i`` of the returned list is a tuple holding position ``i``
    of each input sequence.
    """
    return [step for step in itertools.zip_longest(*l, fillvalue=fillvalue)]
- 制作掩碼矩陣珊蟀,填充的部分設(shè)置為0,非填充的部分設(shè)置為1育灸。
def binaryMatrix(l, value=PAD_token):
    """Build a 0/1 mask with the same nested shape as *l*.

    Args:
        l: Iterable of sequences of token indexes.
        value: Token treated as padding. The previous implementation ignored
            this argument and always compared against the global
            ``PAD_token``; the default keeps that behaviour.

    Returns:
        A list of lists holding 0 where the token equals *value*, else 1.
    """
    return [[0 if token == value else 1 for token in seq] for seq in l]
-
inputVar
和outputVar
是將一個(gè)batch
的數(shù)據(jù)進(jìn)行word
和tensor
的相互轉(zhuǎn)換。inputVar
磅崭,第一句是先將sentence
轉(zhuǎn)換成index
矩陣,第二句是將創(chuàng)建和index
矩陣同樣大小的Tensor
矩陣,torch.tensor([a,b,c,d])
創建一個包含a、b、c、d四個元素的一維
Tensor
矩陣卒落。接下來兩句是對(duì)index
矩陣進(jìn)行填充,然后成成對(duì)應(yīng)的Tensor
矩陣也切,torch.LongTensor(list)
創(chuàng)建一個(gè)和list
同樣形狀的Tensor
矩陣,batch2TrainData
就是將雷恃,隨機(jī)采樣到的小批數(shù)據(jù)倒槐,生成訓(xùn)練中需要的tensor
數(shù)據(jù)讨越。
# Returns padded input sequence tensor and lengths
def inputVar(l, voc):
    """Convert a batch of sentences into a padded index tensor plus lengths.

    Returns:
        ``(padVar, lengths)``: a ``LongTensor`` built from the padded,
        transposed index lists, and a tensor with each sentence's index count
        (EOS included).
    """
    indexes_batch = []
    seq_lengths = []
    for sentence in l:
        indexes = indexesFromSentence(voc, sentence)
        indexes_batch.append(indexes)
        seq_lengths.append(len(indexes))
    lengths = torch.tensor(seq_lengths)
    padVar = torch.LongTensor(zeroPadding(indexes_batch))
    return padVar, lengths
# Returns padded target sequence tensor, padding mask, and max target length
def outputVar(l, voc):
    """Convert a batch of target sentences into tensors for the loss.

    Returns:
        ``(padVar, mask, max_target_len)``: the padded, transposed index
        tensor; a boolean mask of the same shape that is False at padding
        positions; and the length of the longest index list (EOS included).

    Raises:
        ValueError: if *l* is empty (``max`` of an empty list).
    """
    indexes_batch = [indexesFromSentence(voc, sentence) for sentence in l]
    max_target_len = max(len(indexes) for indexes in indexes_batch)
    padList = zeroPadding(indexes_batch)
    # A bool mask replaces the former ByteTensor: current PyTorch releases
    # reject uint8 masks in masked_select, which the loss relies on.
    mask = torch.tensor(binaryMatrix(padList), dtype=torch.bool)
    padVar = torch.LongTensor(padList)
    return padVar, mask, max_target_len
# Returns all items for a given batch of pairs
def batch2TrainData(voc, pair_batch):
    """Turn a list of sentence pairs into the tensors used by one training step.

    ``pair_batch`` is sorted in place, longest input sentence first (length
    counted in space-separated words).

    Returns:
        ``(inp, lengths, output, mask, max_target_len)`` as produced by
        ``inputVar`` and ``outputVar``.
    """
    def _input_word_count(pair):
        return len(pair[0].split(" "))

    pair_batch.sort(key=_input_word_count, reverse=True)
    input_batch = [pair[0] for pair in pair_batch]
    output_batch = [pair[1] for pair in pair_batch]
    inp, lengths = inputVar(input_batch, voc)
    output, mask, max_target_len = outputVar(output_batch, voc)
    return inp, lengths, output, mask, max_target_len
# Example for validation: build one tiny random batch and print each field
small_batch_size = 5
sampled_pairs = [random.choice(pairs) for _ in range(small_batch_size)]
batches = batch2TrainData(voc, sampled_pairs)
(input_variable,
 lengths,
 target_variable,
 mask,
 max_target_len) = batches
print("input_variable:", input_variable)
print("lengths:", lengths)
print("target_variable:", target_variable)
print("mask:", mask)
print("max_target_len:", max_target_len)
構(gòu)建模型
Encoder
nn.GRU(input_size,hidden_size,n_layers,...)
,第一個(gè)參數(shù)輸入vector
的長(zhǎng)度,第二個(gè)根據(jù)GRU
的計(jì)算公式着逐,是輸出和隱藏vector
的長(zhǎng)度(因?yàn)槭巧弦粋€(gè)time_step
的輸出和這個(gè)階段的h
共同組合成這個(gè)階段的輸出)耸别,n_layers
就是需要堆疊的GRU的層數(shù)太雨,如果大于1的話囊扳,就是將n
層GRU
摞起來锥咸,nn.Embedding(voc_size,dimension,..)
搏予,第一個(gè)參數(shù)是表示需要embedding
的單詞的數(shù)目雪侥,dimension
是embedding
之后每一個(gè)單詞的vector
的長(zhǎng)度哼凯,torch.nn.utils.rnn.pack_padded_sequence(input, lengths, batch_first=False)
,將一個(gè)填充過的tensor
打包起來旬牲,input
的形狀是max_length * batch_size
,如果batch_first=True
則更換兩者的順序原茅,并且輸入是一個(gè)已經(jīng)排好序(降序)的tensor
矩陣擂橘,長(zhǎng)的在前贝室,短的在后滑频,lengths
是batch
中輸入序列每個(gè)元素的長(zhǎng)度的一個(gè)列表峡迷,這一函數(shù)的作用就是打包绘搞,進(jìn)行批處理(將多個(gè)Tensor
結(jié)合在一起夯辖,而且排序也是很有必要的蒿褂,能和原來的元素一對(duì)應(yīng)起來)啄栓,torch.nn.utils.rnn.pad_packed_sequence
,是上個函數的逆操作,相當于解壓操作。接下來一句,因為我們使用的是雙向的GRU(代碼中用的是nn.GRU,而不是LSTM),所以會有兩倍的輸出,我們將其相加起來組成最后的輸出。
Packs a Tensor containing padded sequences of variable length.Input can be of size T x B x * where T is the length of the longest sequence (equal to lengths[0]), B is the batch size, and * is any number of dimensions (including 0). If batch_first is True B x T x * inputs are expected.The sequences should be sorted by length in a decreasing order, i.e. input[:,0] should be the longest sequence, and input[:,B-1] the shortest one.This function accepts any input that has at least two dimensions. You can apply it to pack the labels, and use the output of the RNN with them to compute the loss directly.
class EncoderRNN(nn.Module):
    """Bidirectional GRU encoder over embedded, padded input sequences."""

    def __init__(self, hidden_size, embedding, n_layers=1, dropout=0):
        """Store the embedding layer and build the bidirectional GRU.

        Args:
            hidden_size: Used as both the GRU input size and hidden size, so
                the embedding is expected to produce ``hidden_size`` features.
            embedding: Module mapping word indexes to vectors.
            n_layers: Number of stacked GRU layers.
            dropout: Dropout between GRU layers; forced to 0 when
                ``n_layers == 1``.
        """
        super(EncoderRNN, self).__init__()
        self.n_layers = n_layers
        self.hidden_size = hidden_size
        self.embedding = embedding
        # Initialize GRU; the input_size and hidden_size params are both set to 'hidden_size'
        # because our input size is a word embedding with number of features == hidden_size
        self.gru = nn.GRU(hidden_size, hidden_size, n_layers,
                          dropout=(0 if n_layers == 1 else dropout), bidirectional=True)

    def forward(self, input_seq, input_lengths, hidden=None):
        """Encode a padded batch.

        Args:
            input_seq: Word-index tensor; since ``batch_first`` is not set,
                presumably shaped (max_length, batch_size) - confirm with caller.
            input_lengths: Per-sequence lengths (tensor or list), in the
                decreasing order ``pack_padded_sequence`` expects by default.
            hidden: Optional initial hidden state passed to the GRU.

        Returns:
            ``(outputs, hidden)``: the sum of the forward and backward GRU
            outputs, and the GRU's final hidden state.
        """
        # Convert word indexes to embeddings
        embedded = self.embedding(input_seq)
        # pack_padded_sequence requires a lengths tensor to live on the CPU;
        # callers in this file move it to `device`, which fails on CUDA.
        if torch.is_tensor(input_lengths):
            input_lengths = input_lengths.cpu()
        # Pack padded batch of sequences for RNN module
        packed = torch.nn.utils.rnn.pack_padded_sequence(embedded, input_lengths)
        # Forward pass through GRU
        outputs, hidden = self.gru(packed, hidden)
        # Unpack padding
        outputs, _ = torch.nn.utils.rnn.pad_packed_sequence(outputs)
        # Sum bidirectional GRU outputs (first half: forward, second half: backward)
        outputs = outputs[:, :, :self.hidden_size] + outputs[:, :, self.hidden_size:]
        # Return output and final hidden state
        return outputs, hidden
Attention
完成了一個(gè)關(guān)于注意力機(jī)制的類,這樣就不用每個(gè)模型都寫一個(gè)attention
了伺糠,拿來即用训桶,W
是一個(gè)矩陣變換舵揭,在函數(shù)中相當(dāng)于torch.nn.Linear
午绳,就是實(shí)現(xiàn)這個(gè)W
轉(zhuǎn)換拦焚,所以在__init__
里面的首先要初始化這個(gè)能進(jìn)行W
變換的submodule
耕漱,torch.nn.Linear(in_features, out_features, bias=True)
螟够,以general
為例若河,in_features
是h_hat_s
的長(zhǎng)度萧福,out_features
就是經(jīng)過線性變換以后特征的維度鲫忍,(hidden.expand(encoder_output.size(0), -1, -1)
悟民,作用是將某一個(gè)維度是1的維擴(kuò)大到指定的維度射亏,這個(gè)例子中hidden
的形狀是1*n*m
智润,然后擴(kuò)展到encoder_output.size(0)*n*m
窟绷,-1表示不擴(kuò)展這個(gè)維度,值得注意的是擴(kuò)展維度并不會(huì)多耗費(fèi)內(nèi)存,只是更改了視圖(view)把stride設(shè)置為0献宫,torch.cat(A,B,n)
表示在第n維上連接兩個(gè)tensor
,torch.sum(input,dims,keepdim=False...)
表示在維度(可以為一個(gè)列表)上進(jìn)行加法姊途,并使用torch.squeeze()
對(duì)結(jié)果進(jìn)行擠壓捷兰,如果keepdim=True
則不進(jìn)行擠壓,tensor.t()
對(duì)tensor
進(jìn)行轉(zhuǎn)置顶考,這個(gè)函數(shù)只能對(duì)tensor
是二維張量驹沿,將矩陣進(jìn)行轉(zhuǎn)置渊季,
that have a very special property when used with Module s - when they’re assigned as Module attributes they are automatically added to the list of its parameters, and will appear e.g. in parameters() iterator. Assigning a Tensor doesn’t have such effect.
# Luong attention layer
class Attn(torch.nn.Module):
    """Attention scorer supporting the 'dot', 'general' and 'concat' methods."""

    def __init__(self, method, hidden_size):
        """Create the sub-modules required by *method*.

        Args:
            method: One of ``'dot'``, ``'general'`` or ``'concat'``.
            hidden_size: Feature size of the decoder output and encoder outputs.

        Raises:
            ValueError: if *method* is not one of the supported names.
        """
        super(Attn, self).__init__()
        self.method = method
        if self.method not in ['dot', 'general', 'concat']:
            # Format the message as one string; passing two arguments made
            # the exception render as a tuple.
            raise ValueError("{} is not an appropriate attention method.".format(self.method))
        self.hidden_size = hidden_size
        if self.method == 'general':
            self.attn = torch.nn.Linear(self.hidden_size, hidden_size)
        elif self.method == 'concat':
            self.attn = torch.nn.Linear(self.hidden_size * 2, hidden_size)
            # torch.FloatTensor(n) returns uninitialised memory (possibly
            # NaN/inf); start from a small uniform range instead.
            bound = hidden_size ** -0.5
            self.v = torch.nn.Parameter(torch.empty(hidden_size).uniform_(-bound, bound))

    def dot_score(self, hidden, encoder_output):
        """Score by element-wise product summed over the feature dimension."""
        return torch.sum(hidden * encoder_output, dim=2)

    def general_score(self, hidden, encoder_output):
        """Score after applying the linear layer to the encoder output."""
        energy = self.attn(encoder_output)
        return torch.sum(hidden * energy, dim=2)

    def concat_score(self, hidden, encoder_output):
        """Score from tanh(linear([hidden; encoder_output])) weighted by ``v``."""
        # Repeat hidden along dim 0 so it can be concatenated with every encoder step.
        expanded = hidden.expand(encoder_output.size(0), -1, -1)
        energy = self.attn(torch.cat((expanded, encoder_output), 2)).tanh()
        return torch.sum(self.v * energy, dim=2)

    def forward(self, hidden, encoder_outputs):
        """Return softmax-normalised attention weights.

        The scores are transposed, normalised over dim 1, and a singleton
        dimension is inserted at position 1.
        """
        # Calculate the attention weights (energies) based on the given method
        if self.method == 'general':
            attn_energies = self.general_score(hidden, encoder_outputs)
        elif self.method == 'concat':
            attn_energies = self.concat_score(hidden, encoder_outputs)
        elif self.method == 'dot':
            attn_energies = self.dot_score(hidden, encoder_outputs)
        # Transpose max_length and batch_size dimensions
        attn_energies = attn_energies.t()
        # Return the softmax normalized probability scores (with added dimension)
        return F.softmax(attn_energies, dim=1).unsqueeze(1)
Decoder
nn.Dropout()
,embedded = self.embedding_dropout(embedded)
就是隨機(jī)歸0
一些tensor
,tensorA.bmm(tensorB)
相當(dāng)于torch.bmm(tensorA,tensorB)
都是對(duì)兩個(gè)tensor
進(jìn)行批量相乘赎懦,注意第一維度是batch_size
,tensor.squeeze(tensorA,dim=None)
励两,如果不指定維度那么會(huì)將張量里面的所有維度為1的全部消除[2,1,2,1,2]->[2,2,2]
,如果指定了dim=1
,[2,1,2,1,2]->[2,2,1,2]
当悔。torch.tanh
的目的是加入一些非線性變換盲憎,注意rnn_output, hidden = self.gru(embedded, last_hidden)
這里得到的是一個(gè)時(shí)間步里面的結(jié)果,每次一個(gè)輸出和一個(gè)隱藏狀態(tài)窑眯,做.transpose(0,1)
也是因?yàn)橛?xùn)練的時(shí)候batch_size
在第一維現(xiàn)在要求是第零維磅甩。卷要。
# Small demonstration of nn.Dropout: some elements of the tensor are zeroed.
m = nn.Dropout(p=0.2)
# Named `sample` rather than `input` so the builtin input() that the chat
# loop calls later is not shadowed at module level.
sample = torch.randn(20, 16)
output = m(sample)
print(output)
class AttnDecoderRNN(nn.Module):
    """Unidirectional GRU decoder with attention, run one time step per call."""

    def __init__(self, attn_model, embedding, hidden_size, output_size, n_layers=1, dropout=0.1):
        """Build the decoder layers.

        Args:
            attn_model: Attention method name forwarded to ``Attn``.
            embedding: Module mapping word indexes to vectors.
            hidden_size: GRU input and hidden size.
            output_size: Size of the output distribution.
            n_layers: Number of stacked GRU layers.
            dropout: Embedding dropout probability; also used between GRU
                layers when ``n_layers > 1``.
        """
        # The original call named a class that does not exist
        # (LuongAttnDecoderRNN) and raised NameError on construction.
        super(AttnDecoderRNN, self).__init__()
        # Keep for reference
        self.attn_model = attn_model
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.n_layers = n_layers
        self.dropout = dropout
        # Define layers
        self.embedding = embedding
        self.embedding_dropout = nn.Dropout(dropout)
        self.gru = nn.GRU(hidden_size, hidden_size, n_layers, dropout=(0 if n_layers == 1 else dropout))
        self.concat = nn.Linear(hidden_size * 2, hidden_size)
        self.out = nn.Linear(hidden_size, output_size)
        self.attn = Attn(attn_model, hidden_size)

    def forward(self, input_step, last_hidden, encoder_outputs):
        """Decode a single time step.

        Args:
            input_step: Word indexes for the current step (one per batch item).
            last_hidden: Previous GRU hidden state.
            encoder_outputs: Encoder outputs attended over.

        Returns:
            ``(output, hidden)``: softmax probabilities over the output
            vocabulary, and the new GRU hidden state.
        """
        # Note: we run this one step (word) at a time
        # Get embedding of current input word
        embedded = self.embedding(input_step)
        embedded = self.embedding_dropout(embedded)
        # Forward through unidirectional GRU
        rnn_output, hidden = self.gru(embedded, last_hidden)
        # Calculate attention weights from the current GRU output
        attn_weights = self.attn(rnn_output, encoder_outputs)
        # Multiply attention weights to encoder outputs to get new "weighted sum" context vector
        context = attn_weights.bmm(encoder_outputs.transpose(0, 1))
        # Concatenate weighted context vector and GRU output using Luong eq. 5
        rnn_output = rnn_output.squeeze(0)
        context = context.squeeze(1)
        concat_input = torch.cat((rnn_output, context), 1)
        concat_output = torch.tanh(self.concat(concat_input))
        # Predict next word using Luong eq. 6
        output = self.out(concat_output)
        output = F.softmax(output, dim=1)
        # Return output and final hidden state
        return output, hidden


# The model-building code in this file instantiates `LuongAttnDecoderRNN`;
# expose that name as an alias so both spellings resolve to the same class.
LuongAttnDecoderRNN = AttnDecoderRNN
LossFunction
torch.gather(input, dim, index, out=None)
,這個(gè)函數(shù)的作用是將input按照dim所指定的維度,按照index中的次序進(jìn)行收集倍权,輸出的尺寸和index是一樣的薄声,比如下面的例子dim=1
指示對(duì)input
的第1維進(jìn)行收集默辨,且有2個(gè)第一維,所以index
的尺寸是2*n
表谊,此處n=2爆办,指示了每個(gè)第一維元素應(yīng)該如何收集([1,2]
按照[0,0]
收集距辆,[3,4]
按照[0,1]
收集),input
和index
的維度跨算,只能是dim
所指示的哪一個(gè)維度可以不一樣,其他的必須完全一樣。tensor.masked_select(mask).mean()
,torch.masked_select(input, mask, out=None) → Tensor
,將input
按照mask
的中為1的元素的順序收集成一個(gè)一維的tensor
狂窑,mask
不是必須要和input
同一緯度泉哈,但是必須是可以broadcastable
奕纫。
If input is an n-dimensional tensor with size (x0,x1...,xi?1,xi,xi+1,...,xn?1) and dim =i, then index must be an n-dimensional tensor with size (x0,x1,...,xi?1,y,xi+1,...,xn?1) where y≥1 and out will have the same size as index.
Returns a new 1-D tensor which indexes the input tensor according to the binary mask mask which is a ByteTensor.The shapes of the mask tensor and the input tensor don’t need to match, but they must be broadcastable.
>>> t = torch.tensor([[1,2],[3,4]])
>>> torch.gather(t, 1, torch.tensor([[0,0],[1,0]]))
tensor([[ 1, 1],
[ 4, 3]])
>>> x = torch.randn(3, 4)
>>> x
tensor([[ 0.3552, -2.3825, -0.8297, 0.3477],
[-1.2035, 1.2252, 0.5002, 0.6248],
[ 0.1307, -2.0608, 0.1244, 2.0139]])
>>> mask = x.ge(0.5)
>>> mask
tensor([[ 0, 0, 0, 0],
[ 0, 1, 1, 1],
[ 0, 0, 0, 1]], dtype=torch.uint8)
>>> torch.masked_select(x, mask)
tensor([ 1.2252, 0.5002, 0.6248, 2.0139])
def maskNLLLoss(inp, target, mask):
    """Average negative log-likelihood over the non-padded targets of one step.

    Args:
        inp: Probabilities from the decoder; indexed along dim 1 by *target*.
        target: Target word index for each batch item.
        mask: Mask selecting the batch items whose target is not padding.

    Returns:
        ``(loss, nTotal)``: the mean loss over the selected items (moved to
        the global ``device``) and the number of selected items.
    """
    nTotal = mask.sum()
    # gather() returns one column per row; squeeze it away so the loss has
    # the same shape as `mask`. Without this, masked_select broadcast the
    # (B, 1) loss against the (B,) mask into (B, B) and the padded positions
    # were averaged in as well.
    picked = torch.gather(inp, 1, target.view(-1, 1)).squeeze(1)
    crossEntropy = -torch.log(picked)
    loss = crossEntropy.masked_select(mask).mean()
    loss = loss.to(device)
    return loss, nTotal.item()
訓(xùn)練
一個(gè)時(shí)間步的訓(xùn)練
train
指示一個(gè)Mini_batch
的一個(gè)時(shí)間步的訓(xùn)練。tensor.topk(n)
,返回tensor
最大的n個(gè)數(shù)及其下標(biāo)_, topi = decoder_output.topk(1),decoder_input = torch.LongTensor([[topi[i][0] for i in range(batch_size)]])
此處的是一個(gè)batch
,所以先取到topi[i]
在top[i].item()
取出元素升筏。torch.nn.utils.clip_grad_norm_(decoder.parameters(), clip)
進(jìn)行梯度裁剪,防止梯度爆炸灵汪,第二個(gè)參數(shù)是max norm of the gradients
享言,正規(guī)化以后梯度的最大值担锤。
>>> x = torch.arange(1., 6.)
>>> x
tensor([ 1., 2., 3., 4., 5.])
>>> torch.topk(x, 3)
(tensor([ 5., 4., 3.]), tensor([ 4, 3, 2]))
def train(input_variable, lengths, target_variable, mask, max_target_len, encoder, decoder, embedding,
          encoder_optimizer, decoder_optimizer, batch_size, clip, max_length=MAX_LENGTH):
    """Run one optimisation step on a single mini-batch.

    Args:
        input_variable, lengths, target_variable, mask, max_target_len:
            The fields produced by ``batch2TrainData``.
        encoder, decoder: Models to train.
        embedding: Accepted for interface compatibility; not used here.
        encoder_optimizer, decoder_optimizer: Optimisers stepped after backprop.
        batch_size: Number of sequences in the batch.
        clip: Max gradient norm passed to ``clip_grad_norm_``.
        max_length: Accepted for interface compatibility; not used here.

    Returns:
        The loss averaged over all non-padded target tokens of the batch.
    """
    # Zero gradients
    encoder_optimizer.zero_grad()
    decoder_optimizer.zero_grad()

    # Set device options
    input_variable = input_variable.to(device)
    target_variable = target_variable.to(device)
    mask = mask.to(device)
    # pack_padded_sequence (called inside the encoder) requires the lengths
    # tensor on the CPU, so it must not follow the other tensors to `device`.
    lengths = lengths.to("cpu")

    # Initialize variables
    loss = 0
    print_losses = []
    n_totals = 0

    # Forward pass through encoder
    encoder_outputs, encoder_hidden = encoder(input_variable, lengths)

    # Create initial decoder input (start with SOS tokens for each sentence)
    decoder_input = torch.LongTensor([[SOS_token for _ in range(batch_size)]])
    decoder_input = decoder_input.to(device)

    # Set initial decoder hidden state to the encoder's final hidden state
    decoder_hidden = encoder_hidden[:decoder.n_layers]

    # Determine if we are using teacher forcing this iteration
    use_teacher_forcing = random.random() < teacher_forcing_ratio

    # Forward batch of sequences through decoder one time step at a time.
    # Both modes share everything except how the next input is chosen.
    for t in range(max_target_len):
        decoder_output, decoder_hidden = decoder(
            decoder_input, decoder_hidden, encoder_outputs
        )
        if use_teacher_forcing:
            # Teacher forcing: next input is current target
            decoder_input = target_variable[t].view(1, -1)
        else:
            # No teacher forcing: next input is decoder's own current output
            _, topi = decoder_output.topk(1)
            decoder_input = torch.LongTensor([[topi[i][0] for i in range(batch_size)]])
            decoder_input = decoder_input.to(device)
        # Calculate and accumulate loss
        mask_loss, nTotal = maskNLLLoss(decoder_output, target_variable[t], mask[t])
        loss += mask_loss
        print_losses.append(mask_loss.item() * nTotal)
        n_totals += nTotal

    # Perform backpropagation
    loss.backward()

    # Clip gradients: gradients are modified in place
    _ = torch.nn.utils.clip_grad_norm_(encoder.parameters(), clip)
    _ = torch.nn.utils.clip_grad_norm_(decoder.parameters(), clip)

    # Adjust model weights
    encoder_optimizer.step()
    decoder_optimizer.step()

    return sum(print_losses) / n_totals
訓(xùn)練
training_batches = [batch2TrainData(voc, [random.choice(pairs) for _ in range(batch_size)]) for _ in range(n_iteration)]
多糠,首先把所有的訓(xùn)練數(shù)據(jù)采樣下來夹孔。然后每個(gè)循環(huán)拿出一小批數(shù)據(jù)進(jìn)行訓(xùn)練搭伤。if loadFilename:start_iteration = checkpoint['iteration'] + 1
很有靈性的寫法,雖然感覺用處不會(huì)很大拍鲤,input_variable, lengths, target_variable, mask, max_target_len = training_batch
,從training_batch中取出各個(gè)字段季稳。保存數(shù)據(jù)這一段寫的太經(jīng)典了景鼠。
# Write a checkpoint every `save_every` iterations.
if iteration % save_every == 0:
    run_tag = '{}-{}_{}'.format(encoder_n_layers, decoder_n_layers, hidden_size)
    directory = os.path.join(save_dir, model_name, corpus_name, run_tag)
    if not os.path.exists(directory):
        os.makedirs(directory)
    # Everything needed to resume: weights, optimiser state, vocabulary.
    snapshot = {
        'iteration': iteration,
        'en': encoder.state_dict(),
        'de': decoder.state_dict(),
        'en_opt': encoder_optimizer.state_dict(),
        'de_opt': decoder_optimizer.state_dict(),
        'loss': loss,
        'voc_dict': voc.__dict__,
        'embedding': embedding.state_dict()
    }
    torch.save(snapshot, os.path.join(directory, '{}_{}.tar'.format(iteration, 'checkpoint')))
def trainIters(model_name, voc, pairs, encoder, decoder, encoder_optimizer, decoder_optimizer, embedding, encoder_n_layers, decoder_n_layers, save_dir, n_iteration, batch_size, print_every, save_every, clip, corpus_name, loadFilename):
    """Train for up to *n_iteration* iterations, printing and checkpointing.

    One random batch is sampled up front for every iteration. When
    *loadFilename* is truthy, training resumes after the iteration stored in
    the module-level ``checkpoint``. The module-level ``hidden_size`` is read
    when building the checkpoint directory name.
    """
    # Load batches for each iteration
    training_batches = []
    for _ in range(n_iteration):
        sampled_pairs = [random.choice(pairs) for _ in range(batch_size)]
        training_batches.append(batch2TrainData(voc, sampled_pairs))

    # Initializations
    print('Initializing ...')
    start_iteration = 1
    print_loss = 0
    if loadFilename:
        start_iteration = checkpoint['iteration'] + 1

    # Training loop
    print("Training...")
    for iteration in range(start_iteration, n_iteration + 1):
        # Extract fields from this iteration's batch
        (input_variable, lengths, target_variable,
         mask, max_target_len) = training_batches[iteration - 1]

        # Run a training iteration with batch
        loss = train(input_variable, lengths, target_variable, mask, max_target_len, encoder,
                     decoder, embedding, encoder_optimizer, decoder_optimizer, batch_size, clip)
        print_loss += loss

        # Print progress
        if iteration % print_every == 0:
            print_loss_avg = print_loss / print_every
            print("Iteration: {}; Percent complete: {:.1f}%; Average loss: {:.4f}".format(iteration, iteration / n_iteration * 100, print_loss_avg))
            print_loss = 0

        # Save checkpoint
        if iteration % save_every != 0:
            continue
        run_tag = '{}-{}_{}'.format(encoder_n_layers, decoder_n_layers, hidden_size)
        directory = os.path.join(save_dir, model_name, corpus_name, run_tag)
        if not os.path.exists(directory):
            os.makedirs(directory)
        snapshot = {
            'iteration': iteration,
            'en': encoder.state_dict(),
            'de': decoder.state_dict(),
            'en_opt': encoder_optimizer.state_dict(),
            'de_opt': decoder_optimizer.state_dict(),
            'loss': loss,
            'voc_dict': voc.__dict__,
            'embedding': embedding.state_dict()
        }
        torch.save(snapshot, os.path.join(directory, '{}_{}.tar'.format(iteration, 'checkpoint')))
模型評(píng)估
貪婪搜索
decoder_input = torch.ones(1, 1, device=device, dtype=torch.long) * SOS_token
票渠,*一個(gè)常數(shù)就是將tensor
的每一個(gè)元素都乘以這個(gè)常數(shù)问顷,做成1*1
的tensor
可能就是因?yàn)檎f明這一批就只有一個(gè)元素,torch.max(input,dim)
杜窄,返回inputTensor
中指定維度的最大值以及下標(biāo),如果不指定維度那么只返回最大的數(shù)蚀腿,這個(gè)指定維度是便利所有元素值所得到的最大的元素莉钙,是總體的最大元素磁玉,所以input
的維度可以是m*n
蚊伞,貪婪搜索decoder只用將所有的輸入序列進(jìn)行一次encoder
,但是對(duì)decoder
要運(yùn)行max_length
次别垮,比較有意思的事是碳想,all_tokens
首先誰初始化成起始符0胧奔,然后每次將最有可能的那個(gè)詞使用torch.cat
連接起來,因?yàn)?code>torch.max輸出的decoder_input
是一維的岩遗,而輸入要求是二維的所有還要使用torch.unsqueeze
進(jìn)行擴(kuò)維宿礁。
>>> a = torch.randn(4, 4)
>>> a
tensor([[-1.2360, -0.2942, -0.1222, 0.8475],
[ 1.1949, -1.1127, -2.2379, -0.6702],
[ 1.5717, -0.9207, 0.1297, -1.8768],
[-0.6172, 1.0036, -0.6060, -0.2432]])
>>> torch.max(a, 1)
(tensor([ 0.8475, 1.1949, 1.5717, 1.0036]), tensor([ 3, 0, 0, 1]))
>>> a = torch.randn(1, 3)
>>> a
tensor([[ 0.6763, 0.7445, -2.2369]])
>>> torch.max(a)
tensor(0.7445)
class GreedySearchDecoder(nn.Module):
    """Greedy decoding: at every step feed back the highest-scoring token."""

    def __init__(self, encoder, decoder):
        """Keep references to the trained encoder and decoder."""
        super(GreedySearchDecoder, self).__init__()
        self.encoder = encoder
        self.decoder = decoder

    def forward(self, input_seq, input_length, max_length):
        """Decode *max_length* tokens for a single input sequence.

        Args:
            input_seq: Word-index tensor for the input sentence.
            input_length: Lengths tensor passed to the encoder.
            max_length: Number of decoding steps to run.

        Returns:
            ``(all_tokens, all_scores)``: the chosen token index at each step
            and the decoder output value of that token.
        """
        # Forward input through encoder model
        encoder_outputs, encoder_hidden = self.encoder(input_seq, input_length)
        # Prepare encoder's final hidden layer to be first hidden input to the decoder.
        # Use the decoder owned by this module; the original read a global
        # named `decoder`, which may be absent or a different model.
        decoder_hidden = encoder_hidden[:self.decoder.n_layers]
        # Initialize decoder input with SOS_token
        decoder_input = torch.ones(1, 1, device=device, dtype=torch.long) * SOS_token
        # Initialize tensors to append decoded words to
        all_tokens = torch.zeros([0], device=device, dtype=torch.long)
        all_scores = torch.zeros([0], device=device)
        # Iteratively decode one word token at a time
        for _ in range(max_length):
            # Forward pass through decoder
            decoder_output, decoder_hidden = self.decoder(decoder_input, decoder_hidden, encoder_outputs)
            # Obtain most likely word token and its softmax score
            decoder_scores, decoder_input = torch.max(decoder_output, dim=1)
            # Record token and score
            all_tokens = torch.cat((all_tokens, decoder_input), dim=0)
            all_scores = torch.cat((all_scores, decoder_scores), dim=0)
            # Prepare current token to be next decoder input (add a dimension)
            decoder_input = torch.unsqueeze(decoder_input, 0)
        # Return collections of word tokens and scores
        return all_tokens, all_scores
評(píng)估函數(shù)
-
indexes_batch = [indexesFromSentence(voc, sentence)]
得到句子的index
下標(biāo)笔诵,句子sentence
是問答系統(tǒng)的一個(gè)句子乎婿,lengths
的Tensor矩陣捍靠,input_batch = torch.LongTensor(indexes_batch).transpose(0, 1)
將batch_size
放在了第二維剂公,decoded_words = [voc.index2word[token.item()] for token in tokens]
,index2word
是voc
實(shí)例里面的一個(gè)屬性纲辽。 -
input(">")
作用是先輸出一個(gè)>
然后獲取直到回車的字符串并返回,然后對輸入的句子進行normalizeString
吊档,加空格,去除非字母的符號(hào)唾糯,output_words[:] = [x for x in output_words if not (x == 'EOS' or x == 'PAD')]
很有靈性的一段代碼怠硼,去除結(jié)束符和填充符號(hào)鬼贱,但是寫[:]
好像沒用呀。print('Bot:', ' '.join(output_words))
將列表中的元素連接成字符串香璃。
"""
output_words[:]相當(dāng)先把output_words的元素全部復(fù)制一遍然后存到以地址output_words[:]開始的地方这难。
嚴(yán)格的說葡秒,python沒有賦值姻乓,只有名字到對(duì)象的綁定。所以L1=L是把L所指的對(duì)象綁定到名字L1上眯牧,而L2=L[:]則是把L通過切片運(yùn)算取得的新列表對(duì)象綁定到L2上蹋岩。前者兩個(gè)名字指向同一個(gè)對(duì)象,后者兩個(gè)名字指向不同對(duì)象学少。換句話說剪个,L1和L是指的同一個(gè)東西,那么修改L1也就修改了L旱易;L2則是不同的東西禁偎,修改L2不會(huì)改變L。注意這個(gè)引用的概念對(duì)于所有的東西都成立阀坏,例如容器內(nèi)部存儲(chǔ)的都是引用如暖。
"""
i=[1,2,3,4,5]
l=i[:]
i[2]=9
print(i)
print(l)
---------------------
[1, 2, 9, 4, 5]
[1, 2, 3, 4, 5]
---------------------
i=[1,2,3,4,5]
i[:]=[6,7,8]
print(i)
--------------------
[6, 7, 8]
[6, 7, 8]
def evaluate(encoder, decoder, searcher, voc, sentence, max_length=MAX_LENGTH):
    """Decode a reply for one normalized *sentence* and return its words.

    ``encoder`` and ``decoder`` are accepted for interface compatibility;
    decoding is delegated entirely to *searcher*.

    Raises:
        KeyError: if the sentence contains a word missing from the vocabulary.
    """
    ### Format input sentence as a batch
    # words -> indexes
    indexes_batch = [indexesFromSentence(voc, sentence)]
    # Create lengths tensor
    lengths = torch.tensor([len(indexes) for indexes in indexes_batch])
    # Transpose dimensions of batch to match models' expectations
    input_batch = torch.LongTensor(indexes_batch).transpose(0, 1)
    # Use appropriate device
    input_batch = input_batch.to(device)
    # The lengths end up in pack_padded_sequence, which requires a CPU
    # tensor; moving them to `device` breaks evaluation on CUDA.
    lengths = lengths.to("cpu")
    # Decode sentence with searcher
    tokens, scores = searcher(input_batch, lengths, max_length)
    # indexes -> words
    decoded_words = [voc.index2word[token.item()] for token in tokens]
    return decoded_words
def evaluateInput(encoder, decoder, searcher, voc):
    """Interactive chat loop: read a line, print the bot's reply, repeat.

    Typing ``q`` or ``quit`` ends the loop. A ``KeyError`` raised while
    handling a line is reported as an unknown word and the loop continues.
    """
    input_sentence = ''
    while True:
        try:
            # Get input sentence
            input_sentence = input('> ')
            # Check if it is quit case
            if input_sentence in ('q', 'quit'):
                break
            # Normalize sentence
            input_sentence = normalizeString(input_sentence)
            # Evaluate sentence
            output_words = evaluate(encoder, decoder, searcher, voc, input_sentence)
            # Drop the EOS/PAD markers in place before printing the reply
            output_words[:] = [word for word in output_words if word not in ('EOS', 'PAD')]
            print('Bot:', ' '.join(output_words))
        except KeyError:
            print("Error: Encountered unknown word.")
運(yùn)行模型
-
__dict__
,python中的類忌堂,都會(huì)從object里繼承一個(gè)__dict__
屬性盒至,這個(gè)屬性中存放著類的屬性和方法對(duì)應(yīng)的鍵值對(duì)。一個(gè)類實(shí)例化之后士修,這個(gè)類的實(shí)例也具有這么一個(gè)__dict__
屬性枷遂。但是二者并不相同。
In [26]: class A:
...: some = 1
...: def __init__(self,num):
...: self.num = num
...:
In [27]: a = A(10)
In [28]: print(a.__dict__)
{'num': 10}
In [30]: a.age = 10
In [31]: print(a.__dict__)
{'num': 10, 'age': 10}
- 加載模型棋嘲,1.首先創(chuàng)建模型酒唉,2.使用
model.load_state_dict(embedding_sd)
加載模型參數(shù)。
# Create the shared embedding; restore its weights when resuming
embedding = nn.Embedding(voc.num_words, hidden_size)
if loadFilename:
    embedding.load_state_dict(embedding_sd)
# Initialize encoder & decoder models
encoder = EncoderRNN(hidden_size, embedding, encoder_n_layers, dropout)
decoder = LuongAttnDecoderRNN(
    attn_model, embedding, hidden_size, voc.num_words, decoder_n_layers, dropout
)
if loadFilename:
    encoder.load_state_dict(encoder_sd)
    decoder.load_state_dict(decoder_sd)
# Move both models to the selected device
encoder = encoder.to(device)
decoder = decoder.to(device)
# Configure models
model_name = 'cb_model'
attn_model = 'dot'
#attn_model = 'general'
#attn_model = 'concat'
hidden_size = 500
encoder_n_layers = 2
decoder_n_layers = 2
dropout = 0.1
batch_size = 64

# Set checkpoint to load from; set to None if starting from scratch
loadFilename = None
checkpoint_iter = 4000
#loadFilename = os.path.join(save_dir, model_name, corpus_name,
#                            '{}-{}_{}'.format(encoder_n_layers, decoder_n_layers, hidden_size),
#                            '{}_checkpoint.tar'.format(checkpoint_iter))

# Load model if a loadFilename is provided
if loadFilename:
    # If loading on same machine the model was trained on
    checkpoint = torch.load(loadFilename)
    # If loading a model trained on GPU to CPU
    #checkpoint = torch.load(loadFilename, map_location=torch.device('cpu'))
    encoder_sd, decoder_sd = checkpoint['en'], checkpoint['de']
    encoder_optimizer_sd, decoder_optimizer_sd = checkpoint['en_opt'], checkpoint['de_opt']
    embedding_sd = checkpoint['embedding']
    # Restore the vocabulary by replacing the instance's attribute dict
    voc.__dict__ = checkpoint['voc_dict']

print('Building encoder and decoder ...')
# Initialize word embeddings
embedding = nn.Embedding(voc.num_words, hidden_size)
if loadFilename:
    embedding.load_state_dict(embedding_sd)
# Initialize encoder & decoder models
encoder = EncoderRNN(hidden_size, embedding, encoder_n_layers, dropout)
decoder = LuongAttnDecoderRNN(
    attn_model, embedding, hidden_size, voc.num_words, decoder_n_layers, dropout
)
if loadFilename:
    encoder.load_state_dict(encoder_sd)
    decoder.load_state_dict(decoder_sd)
# Use appropriate device
encoder = encoder.to(device)
decoder = decoder.to(device)
print('Models built and ready to go!')
運(yùn)行模型
# Configure training/optimization
clip = 50.0
teacher_forcing_ratio = 1.0
learning_rate = 0.0001
decoder_learning_ratio = 5.0
n_iteration = 4000
print_every = 1
save_every = 500

# Ensure dropout layers are in train mode
encoder.train()
decoder.train()

# Initialize optimizers; the decoder's learning rate is scaled by decoder_learning_ratio
print('Building optimizers ...')
decoder_lr = learning_rate * decoder_learning_ratio
encoder_optimizer = optim.Adam(encoder.parameters(), lr=learning_rate)
decoder_optimizer = optim.Adam(decoder.parameters(), lr=decoder_lr)
if loadFilename:
    encoder_optimizer.load_state_dict(encoder_optimizer_sd)
    decoder_optimizer.load_state_dict(decoder_optimizer_sd)

# Run training iterations
print("Starting Training!")
trainIters(
    model_name, voc, pairs, encoder, decoder,
    encoder_optimizer, decoder_optimizer, embedding,
    encoder_n_layers, decoder_n_layers, save_dir,
    n_iteration, batch_size, print_every, save_every,
    clip, corpus_name, loadFilename,
)
運(yùn)行模型沸移,與模型進(jìn)行交流
將最后一行的注釋取掉就可以與machine交流了痪伦。
# Switch both models to eval mode so dropout is disabled
encoder.eval()
decoder.eval()

# Wrap encoder and decoder in the greedy search module
searcher = GreedySearchDecoder(encoder, decoder)

# Begin chatting (uncomment and run the following line to begin)
# evaluateInput(encoder, decoder, searcher, voc)