Chapter 1. Code Walkthrough
Full code: GitHub link, in the Untitled.ipynb file. It fixes bugs found in some code circulating online and resolves several issues caused by differences between TensorFlow versions.
[The tests below were run before training had finished; the results after training completes will be updated tonight.]
Dataset link. After downloading the dataset, unzip it and extract the dgk_shooter_min.conv file; it is best to transcode it first. I suggest opening it in Notepad, choosing "Save As", and selecting UTF-8 as the encoding.
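If you would rather do the conversion in code, here is a minimal sketch, assuming the raw file is GBK-encoded (typical for this corpus; adjust if your copy differs):
# Minimal re-encoding sketch; the source encoding 'gbk' is an assumption,
# adjust it if your copy of the corpus differs.
with open('dgk_shooter_min.conv', encoding='gbk', errors='ignore') as src:
    text = src.read()
with open('dgk_shooter_min.conv.txt', 'w', encoding='utf-8') as dst:
    dst.write(text)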
(1) Data preprocessing
#coding=utf-8
#(1) Data preprocessing
import os
import random
from io import open

conv_path = 'dgk_shooter_min.conv.txt'

# check that the dataset exists
if not os.path.exists(conv_path):
    print('Dataset not found')
    exit()
# Dataset format
"""
E
M 畹/華/吾/侄/
M 你/接/到/這/封/信/的/時/候/
M 不/知/道/大/伯/還/在/不/在/人/世/了/
E
M 咱/們/梅/家/從/你/爺/爺/起/
M 就/一/直/小/心/翼/翼/地/唱/戲/
M 侍/奉/宮/廷/侍/奉/百/姓/
M 從/來/不/曾/遭/此/大/禍/
M 太/后/的/萬/壽/節(jié)/誰/敢/不/穿/紅/
M 就/你/膽/兒/大/
M 唉/這/我/舅/母/出/殯/
M 我/不/敢/穿/紅/啊/
M 唉/呦/唉/呦/爺/
M 您/打/得/好/我/該/打/
M 就/因/為/沒/穿/紅/讓/人/賞/咱/一/紙/枷/鎖/
M 爺/您/別/給/我/戴/這/紙/枷/鎖/呀/
E
M 您/多/打/我/幾/下/不/就/得/了/嗎/
M 走/
M 這/是/哪/一/出/啊/…/ / /這/是/
M 撕/破/一/點(diǎn)/就/弄/死/你/
M 唉/
M 記/著/唱/戲/的/再/紅/
M 還/是/讓/人/瞧/不/起/
M 大/伯/不/想/讓/你/挨/了/打/
M 還/得/跟/人/家/說/打/得/好/
M 大/伯/不/想/讓/你/再/戴/上/那/紙/枷/鎖/
M 畹/華/開/開/門/哪/
E
...
"""
# I first used the Sublime text editor to convert dgk_shooter_min.conv to UTF-8, which saved a lot of trouble
convs = []  # the full set of conversations
with open(conv_path, encoding="utf8") as f:
    one_conv = []  # one complete conversation
    for line in f:
        line = line.strip('\n').replace('/', '')  # strip the '/' separators
        if line == '':
            continue
        if line[0] == 'E':  # 'E' marks the end of a conversation
            if one_conv:
                convs.append(one_conv)
            one_conv = []
        elif line[0] == 'M':  # 'M' marks one utterance
            parts = line.split(' ')
            if len(parts) > 1:  # guard against bare 'M' lines in the corpus
                one_conv.append(parts[1])

print(convs[:3])  # frankly, the dialogue corpus feels a bit underwhelming
#[ ['畹華吾侄', '你接到這封信的時候', '不知道大伯還在不在人世了'],
# ['咱們梅家從你爺爺起', '就一直小心翼翼地唱戲', '侍奉宮廷侍奉百姓', '從來不曾遭此大禍', '太后的萬壽節(jié)誰敢不穿紅', '就你膽兒大', '唉這我舅母出殯', '我不敢穿紅啊', '唉呦唉呦爺', '您打得好我該打', '就因為沒穿紅讓人賞咱一紙枷鎖', '爺您別給我戴這紙枷鎖呀'],
# ['您多打我?guī)紫虏痪偷昧藛?, '走', '這是哪一出啊 ', '撕破一點(diǎn)就弄死你', '唉', '記著唱戲的再紅', '還是讓人瞧不起', '大伯不想讓你挨了打', '還得跟人家說打得好', '大伯不想讓你再戴上那紙枷鎖', '畹華開開門哪'], ....]
# split each conversation into questions and answers
ask = []       # questions
response = []  # answers
for conv in convs:
    if len(conv) == 1:
        continue
    if len(conv) % 2 != 0:  # odd number of utterances: drop the last one to make it even
        conv = conv[:-1]
    for i in range(len(conv)):
        if i % 2 == 0:
            ask.append(conv[i])       # even index: question
        else:
            response.append(conv[i])  # odd index: answer

print(len(ask), len(response))
print(ask[:3])
print(response[:3])
#['畹華吾侄', '咱們梅家從你爺爺起', '侍奉宮廷侍奉百姓']
#['你接到這封信的時候', '就一直小心翼翼地唱戲', '從來不曾遭此大禍']
def convert_seq2seq_files(questions, answers, TESTSET_SIZE=8000):
    # create the output files
    train_enc = open('train.enc', 'w', encoding='utf-8')  # questions
    train_dec = open('train.dec', 'w', encoding='utf-8')  # answers
    test_enc = open('test.enc', 'w', encoding='utf-8')    # questions
    test_dec = open('test.dec', 'w', encoding='utf-8')    # answers
    # hold out 8000 randomly chosen pairs as test data (a set makes the membership test fast)
    test_index = set(random.sample(range(len(questions)), TESTSET_SIZE))
    for i in range(len(questions)):
        if i in test_index:  # write to the test files
            test_enc.write(questions[i] + '\n')
            test_dec.write(answers[i] + '\n')
        else:                # write to the training files
            train_enc.write(questions[i] + '\n')
            train_dec.write(answers[i] + '\n')
        if i % 1000 == 0:    # progress report
            print(len(questions), 'processed:', i)
    train_enc.close()
    train_dec.close()
    test_enc.close()
    test_dec.close()

convert_seq2seq_files(ask, response)
# the generated *.enc files contain the questions
# the generated *.dec files contain the answers
After processing, the dataset is saved in question/answer form, with 8,000 pairs held out as test data. The resulting .enc files contain the questions and the .dec files contain the answers (a quick size check follows the previews below).
Preview of a question file (*.enc):
爺爺您戲改得真好
您怎么不進(jìn)去呀
王老板
見過
地球再也無法承受人類的數(shù)量
我現(xiàn)在是和摩蘭達(dá)說話嗎飘千?
我們不是告訴他們應(yīng)該想什么
Preview of an answer file (*.dec):
這回跟您可真是一棵菜了
我等人拿鑰匙呢
唉
什么事
我們發(fā)現(xiàn)了一個新的太陽系
不是
我們僅僅是想告訴他們應(yīng)該怎么想
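A quick way to verify the split is to count the lines in the generated files; each test file should contain 8,000 lines:
for name in ['train.enc', 'train.dec', 'test.enc', 'test.dec']:
    with open(name, encoding='utf-8') as f:
        print(name, sum(1 for _ in f))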
(2) Building the vocabulary
#coding=utf-8
#(2) Building the vocabulary
# paths of the question/answer files generated in the previous step
train_encode_file = 'train.enc'
train_decode_file = 'train.dec'
test_encode_file = 'test.enc'
test_decode_file = 'test.dec'

print('Building the vocabulary...')
# special tokens used to pad and mark the dialogues
PAD = "__PAD__"
GO = "__GO__"
EOS = "__EOS__"  # end of an utterance
UNK = "__UNK__"  # marks characters that are not in the vocabulary
START_VOCABULARY = [PAD, GO, EOS, UNK]
PAD_ID = 0
GO_ID = 1
EOS_ID = 2
UNK_ID = 3
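# How these tokens are used downstream (illustrative, based on the standard
# TensorFlow translate model's get_batch): for a pair ("你好" -> "你好啊")
# in bucket (5, 10), the model roughly sees
#   encoder input (padded, then reversed): [PAD, PAD, PAD, 好, 你]
#   decoder input: [GO, 你, 好, 啊, EOS, PAD, PAD, PAD, PAD, PAD]
# GO starts decoding, EOS ends the target, and PAD fills each bucket.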
# see tensorflow.models.rnn.translate.data_utils
vocabulary_size = 5000

# generate a vocabulary file
def gen_vocabulary_file(input_file, output_file):
    vocabulary = {}
    with open(input_file, encoding="utf8") as f:
        for line in f:
            for word in line.strip():  # character-level tokens
                if word in vocabulary:
                    vocabulary[word] += 1
                else:
                    vocabulary[word] = 1
    # special tokens first, then characters sorted by descending frequency
    vocabulary_list = START_VOCABULARY + sorted(vocabulary, key=vocabulary.get, reverse=True)
    # keep the 5000 most frequent characters; that should be roughly enough
    # (there are a lot of junk characters that would ideally be cleaned up; I won't do that here)
    if len(vocabulary_list) > vocabulary_size:
        vocabulary_list = vocabulary_list[:vocabulary_size]
    print(input_file + " vocabulary size:", len(vocabulary_list))
    with open(output_file, "w", encoding="utf8") as ff:
        for word in vocabulary_list:
            ff.write(word + "\n")

gen_vocabulary_file(train_encode_file, "train_encode_vocabulary")
gen_vocabulary_file(train_decode_file, "train_decode_vocabulary")
train_encode_vocabulary_file = 'train_encode_vocabulary'
train_decode_vocabulary_file = 'train_decode_vocabulary'

print("Converting dialogues to vectors...")
# convert each dialogue string into a sequence of token IDs
def convert_to_vector(input_file, vocabulary_file, output_file):
    tmp_vocab = []
    with open(vocabulary_file, "r", encoding="utf8") as f:
        tmp_vocab.extend(f.readlines())
    tmp_vocab = [line.strip() for line in tmp_vocab]
    vocab = dict([(x, y) for (y, x) in enumerate(tmp_vocab)])  # token -> ID
    # {'碩': 3142, 'v': 577, 'I': 4789, '\ue796': 4515, '拖': 1333, '疤': 2201 ...}
    output_f = open(output_file, 'w', encoding="utf8")
    with open(input_file, 'r', encoding="utf8") as f:
        for line in f:
            line_vec = []
            for word in line.strip():
                line_vec.append(vocab.get(word, UNK_ID))  # unknown characters map to UNK_ID
            output_f.write(" ".join([str(num) for num in line_vec]) + "\n")
    output_f.close()

convert_to_vector(train_encode_file, train_encode_vocabulary_file, 'train_encode.vec')
convert_to_vector(train_decode_file, train_decode_vocabulary_file, 'train_decode.vec')
convert_to_vector(test_encode_file, train_encode_vocabulary_file, 'test_encode.vec')
convert_to_vector(test_decode_file, train_decode_vocabulary_file, 'test_decode.vec')
The 5,000 most frequent characters are extracted to build the vocabulary.
Preview of a vocabulary file (*_vocabulary):
__PAD__
__GO__
__EOS__
__UNK__
我
的
你
是
,
不
了
們
Each dialogue string is then converted to vector form, i.e. a sequence of token IDs.
Preview of a vector file (*.vec):
6 269 31 13 1022 157 5 60 190
28 14 226 92 113 2047 2047 98 909 724
137 22 9 644 1331 278 63 1685
28 6 1363 118 63
4 9 652 514 824 88
433 131 51 24 4 127 131
1093 433 94 81 4 884 13 840 3435 1010 366
The generated train_encode.vec and train_decode.vec are used for training, together with the corresponding vocabularies train_encode_vocabulary and train_decode_vocabulary.
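To sanity-check the conversion, a line of IDs can be mapped back to characters. A minimal sketch (the file names are the ones generated above; a token's ID is simply its line number in the vocabulary file, counting from 0):
# decode the first line of train_encode.vec back into characters
with open('train_encode_vocabulary', encoding='utf8') as f:
    id2word = [line.strip() for line in f]  # index == token ID
with open('train_encode.vec', encoding='utf8') as f:
    print(''.join(id2word[int(i)] for i in f.readline().split()))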
(3) Training
Only selected parts of the code are explained here; see the full code link.
The vector files are loaded for training. A read_data function reads the question vector files (encode.vec) and the answer vector files (decode.vec) for both the training set and the test set.
Each line of an encode.vec file is split on whitespace (the default separator) to form a source sequence, and the corresponding line of the decode.vec file is split the same way to form a target sequence. The pair of sequences is then appended to data_set, into the first bucket it fits. Once every line has been processed and added, data_set is returned.
# Read the *encode.vec and *decode.vec data (the data set is small enough to load into memory at once)
def read_data(source_path, target_path, max_size=None):
    data_set = [[] for _ in buckets]  # one empty list per bucket: [[], [], [], []]
    with tf.gfile.GFile(source_path, mode="r") as source_file:  # open the source file for reading
        with tf.gfile.GFile(target_path, mode="r") as target_file:  # open the target file for reading
            source, target = source_file.readline(), target_file.readline()  # read one line from each
            counter = 0
            while source and target and (not max_size or counter < max_size):  # stop after max_size pairs, if given
                counter += 1
                source_ids = [int(x) for x in source.split()]  # source sequence: IDs split on whitespace
                target_ids = [int(x) for x in target.split()]  # target sequence
                target_ids.append(EOS_ID)  # append the end-of-sequence ID to the target
                for bucket_id, (source_size, target_size) in enumerate(buckets):  # enumerate() yields index and element
                    if len(source_ids) < source_size and len(target_ids) < target_size:  # does the pair fit this bucket?
                        data_set[bucket_id].append([source_ids, target_ids])  # store the pair in this bucket
                        break  # use only the first bucket that fits
                source, target = source_file.readline(), target_file.readline()  # read the next pair
    return data_set
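A quick illustration of the structure read_data returns (this assumes buckets = [(5, 10), (10, 15), (20, 25), (40, 50)], the same values used in the test script of section (4)):
train_set = read_data('train_encode.vec', 'train_decode.vec')
print([len(b) for b in train_set])  # how many (source, target) pairs landed in each bucket
print(train_set[0][0])  # first pair in the smallest bucket: [source_ids, target_ids], target ending in EOS_ID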
Building the model
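The constructor below references hyperparameters defined elsewhere in the full script. For reference, a plausible configuration (my assumption, mirroring the test script in section (4) except for a larger training batch size) is:
vocabulary_encode_size = 5000
vocabulary_decode_size = 5000
buckets = [(5, 10), (10, 15), (20, 25), (40, 50)]
layer_size = 256  # units per layer
num_layers = 3    # number of layers
batch_size = 64   # assumed training batch size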
model = seq2seq_model.Seq2SeqModel(source_vocab_size=vocabulary_encode_size, target_vocab_size=vocabulary_decode_size,
                                   buckets=buckets, size=layer_size, num_layers=num_layers, max_gradient_norm=5.0,
                                   batch_size=batch_size, learning_rate=0.5, learning_rate_decay_factor=0.97,
                                   forward_only=False)
Start training
with tf.Session(config=config) as sess:
    # restore the previous training run, if any
    ckpt = tf.train.get_checkpoint_state('.')
    if ckpt is not None:
        print(ckpt.model_checkpoint_path)
        model.saver.restore(sess, ckpt.model_checkpoint_path)
    else:
        sess.run(tf.global_variables_initializer())
    train_set = read_data(train_encode_vec, train_decode_vec)
    test_set = read_data(test_encode_vec, test_decode_vec)
    train_bucket_sizes = [len(train_set[b]) for b in range(len(buckets))]  # number of training pairs per bucket
    train_total_size = float(sum(train_bucket_sizes))  # total number of training pairs
    # cumulative fraction of pairs up to and including each bucket, e.g. [0.3, 0.55, 0.85, 1.0]
    train_buckets_scale = [sum(train_bucket_sizes[:i + 1]) / train_total_size for i in range(len(train_bucket_sizes))]
    loss = 0.0  # running loss over the current reporting window
    total_step = 0
    previous_losses = []
    # train indefinitely, saving the model every so often
    while True:
        random_number_01 = np.random.random_sample()  # a fresh uniform draw in [0, 1) each iteration
        # pick the smallest bucket whose cumulative fraction exceeds the draw, so buckets are
        # sampled in proportion to their size (see the sketch after this code block)
        bucket_id = min([i for i in range(len(train_buckets_scale)) if train_buckets_scale[i] > random_number_01])
        # get_batch() first obtains this bucket's encoder_size and decoder_size, then pads a batch to them
        encoder_inputs, decoder_inputs, target_weights = model.get_batch(train_set, bucket_id)
        _, step_loss, _ = model.step(sess, encoder_inputs, decoder_inputs, target_weights, bucket_id, False)  # one training step
        loss += step_loss / 500  # average the loss over the 500-step reporting window
        total_step += 1
        print(total_step)
        if total_step % 500 == 0:
            print(model.global_step.eval(), model.learning_rate.eval(), loss)
            # if the loss has not improved over the last three windows, decay the learning rate
            if len(previous_losses) > 2 and loss > max(previous_losses[-3:]):
                sess.run(model.learning_rate_decay_op)
            previous_losses.append(loss)
            # save a checkpoint; the file is named "%s-%s" % (checkpoint_path, global_step)
            checkpoint_path = "./chatbot_seq2seq.ckpt"
            model.saver.save(sess, checkpoint_path, global_step=model.global_step)
            loss = 0.0  # reset the window loss
            # evaluate the model on the test data
            for bucket_id in range(len(buckets)):
                if len(test_set[bucket_id]) == 0:
                    continue
                # get this bucket's encoder_inputs, decoder_inputs, target_weights
                encoder_inputs, decoder_inputs, target_weights = model.get_batch(test_set, bucket_id)
                _, eval_loss, _ = model.step(sess, encoder_inputs, decoder_inputs, target_weights, bucket_id, True)  # forward only
                eval_ppx = math.exp(eval_loss) if eval_loss < 300 else float('inf')  # per-bucket perplexity
                print(bucket_id, eval_ppx)
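The bucket-sampling trick above picks each bucket with probability proportional to how many pairs it holds. A standalone illustration (the scale values here are made up for the example):
import numpy as np

train_buckets_scale = [0.30, 0.55, 0.85, 1.00]  # hypothetical cumulative fractions
counts = [0, 0, 0, 0]
for _ in range(10000):
    r = np.random.random_sample()
    bucket_id = min(i for i in range(len(train_buckets_scale)) if train_buckets_scale[i] > r)
    counts[bucket_id] += 1
print(counts)  # roughly proportional to [0.30, 0.25, 0.30, 0.15]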
(4) Testing the model
#coding=utf-8
#(4) Using the trained model
import tensorflow as tf  # 0.12
# from tensorflow.models.rnn.translate import seq2seq_model
from tensorflow.models.tutorials.rnn.chatbot import seq2seq_model  # note: download seq2seq_model yourself and import it from your own path
# I downloaded the seq2seq_model module and copied it into tensorflow/models/tutorials/rnn/chatbot/, hence this import path
import os
import numpy as np

PAD_ID = 0
GO_ID = 1
EOS_ID = 2
UNK_ID = 3
tf.reset_default_graph()

# vocabulary paths
train_encode_vocabulary = 'train_encode_vocabulary'
train_decode_vocabulary = 'train_decode_vocabulary'

# read a vocabulary file
def read_vocabulary(input_file):
    tmp_vocab = []
    with open(input_file, "r", encoding='utf-8') as f:
        tmp_vocab.extend(f.readlines())  # read the whole file
    tmp_vocab = [line.strip() for line in tmp_vocab]  # one token per line
    vocab = dict([(x, y) for (y, x) in enumerate(tmp_vocab)])  # token -> ID
    return vocab, tmp_vocab  # (dict, list)

vocab_en, _ = read_vocabulary(train_encode_vocabulary)  # encoder vocabulary: token -> ID dict
_, vocab_de = read_vocabulary(train_decode_vocabulary)  # decoder vocabulary: ID -> token list

# vocabulary sizes are 5000
vocabulary_encode_size = 5000
vocabulary_decode_size = 5000
buckets = [(5, 10), (10, 15), (20, 25), (40, 50)]
layer_size = 256  # units per layer
num_layers = 3    # number of layers
batch_size = 1
model = seq2seq_model.Seq2SeqModel(source_vocab_size=vocabulary_encode_size, target_vocab_size=vocabulary_decode_size,
                                   buckets=buckets, size=layer_size, num_layers=num_layers, max_gradient_norm=5.0,
                                   batch_size=batch_size, learning_rate=0.5, learning_rate_decay_factor=0.99,
                                   forward_only=True)
# Model notes: source/target vocabulary sizes = vocabulary_encode(decode)_size;
# batch_size: the batch size used during training; forward_only: forward pass only, no error backpropagation
model.batch_size = 1  # decode one sentence at a time
with tf.Session() as sess:
    # restore the trained model
    ckpt = tf.train.get_checkpoint_state('.')  # returns the checkpoint state, if any
    if ckpt is not None:
        print(ckpt.model_checkpoint_path)
        model.saver.restore(sess, ckpt.model_checkpoint_path)  # restore the model parameters
    else:
        print("Model not found")
        exit()  # without a checkpoint there is nothing to test
    # interactive test loop
    while True:
        input_string = input('me > ')
        if input_string == 'quit':  # type 'quit' to exit
            exit()
        # convert the input string to a sequence of token IDs
        input_string_vec = []
        for word in input_string.strip():
            input_string_vec.append(vocab_en.get(word, UNK_ID))  # get(): the token's ID if known, else UNK_ID
        # pick the smallest bucket that can hold the input
        bucket_id = min([b for b in range(len(buckets)) if buckets[b][0] > len(input_string_vec)])
        # get_batch(data, bucket_id): data maps bucket_id to a list of (input, output) pairs;
        # it returns the padded encoder_inputs, decoder_inputs and target_weights for that bucket
        encoder_inputs, decoder_inputs, target_weights = model.get_batch({bucket_id: [(input_string_vec, [])]}, bucket_id)
        _, _, output_logits = model.step(sess, encoder_inputs, decoder_inputs, target_weights, bucket_id, True)  # forward pass
        # greedy decoding: take the argmax token at each output position
        outputs = [int(np.argmax(logit, axis=1)) for logit in output_logits]
        if EOS_ID in outputs:  # truncate the output at the first end-of-sequence token
            outputs = outputs[:outputs.index(EOS_ID)]
        # map the IDs back to characters and join them into the reply
        response = "".join([tf.compat.as_str(vocab_de[output]) for output in outputs])
        print('AI--PigPig > ' + response)
Test results:
Below are the test results after 5,500 training steps:
[Final results to be updated]
Tsundere attribute acquired.
After 10,000 training steps, it starts to get cute ^ _ ^