首先前提是安裝了TensorFlow,按照官方文檔進(jìn)行就可以,有時(shí)候網(wǎng)絡(luò)可能比較慢,有條件的話最好掛代理。注意Ubuntu本機(jī)一般是python2.7,最好使用virtualenv建立python3+的環(huán)境,否則如果導(dǎo)致本地python環(huán)境異常,得不償失。在安裝好tensorflow之後,就可以進(jìn)行以下試驗(yàn)了。
唐詩語料後續(xù)補(bǔ)充下載鏈接,包含4萬首唐詩,本文是基於騰訊雲(yún)的一篇教程做的改進(jìn)和記錄,後面會(huì)放出詳細(xì)的鏈接。教程裏包括唐詩、驗(yàn)證碼識別、聊天機(jī)器人等多個(gè)實(shí)驗(yàn),都是可以簡單修改跑通的例子,非常有價(jià)值,有興趣可以了解下。
如下代碼分為幾個(gè)部分:1.語料整理,規(guī)範(fàn)化。2.訓(xùn)練:兩層RNN,使用LSTM模型訓(xùn)練。3.執(zhí)行訓(xùn)練,默認(rèn)的40000步。4.用於生成古詩。
先貼下代碼。文字處理首要的是合適的訓(xùn)練資源外加處理,這部分其實(shí)會(huì)占很大的工作量。
本地是python3.5,做了簡單修改。
generate_poetry.py
#-*- coding:utf-8 -*-
import numpy as np
from io import open
import sys
import collections
import imp
imp.reload(sys)
#reload(sys)
#sys.setdefaultencoding('utf8') //本地環(huán)境是python3.5做適配
class Poetry:
    """Tang-poetry corpus loader: cleans the raw poems and maps them to id vectors."""

    def __init__(self):
        self.filename = "poetry"
        self.poetrys = self.get_poetrys()
        self.poetry_vectors, self.word_to_id, self.id_to_word = self.gen_poetry_vectors()
        self.poetry_vectors_size = len(self.poetry_vectors)
        # Cursor into poetry_vectors for next_batch().
        self._index_in_epoch = 0

    def get_poetrys(self):
        """Read self.filename ("title:content" per line, full-width colon) and
        return the cleaned poems, each wrapped as '[' + content + ']'.

        Poems are dropped when they contain special symbols, are shorter than 5
        or longer than 79 characters, or are neither five- nor seven-character
        verse.
        """
        poetrys = list()
        # `with` guarantees the handle is closed (the original leaked it).
        with open(self.filename, "r", encoding='utf-8') as f:
            for line in f:
                # maxsplit=1: a body containing another full-width colon would
                # otherwise raise ValueError on tuple unpacking.
                _, content = line.strip('\n').strip().split(':', 1)
                content = content.replace(' ', '')
                # Drop poems containing special symbols.
                if (not content or '_' in content or '(' in content or '(' in content
                        or "□" in content or '《' in content or '[' in content
                        or ':' in content or ':' in content):
                    continue
                # Drop poems that are too short or too long.
                if len(content) < 5 or len(content) > 79:
                    continue
                # Split into sentences on the full-width comma and period.
                # Bug fix: the period literal was garbled ('呀狼。') so the split
                # never matched and almost every poem was rejected.
                content_list = content.replace(',', '|').replace('。', '|').split('|')
                flag = True
                # Keep only poems whose every sentence has 5 or 7 characters.
                for sentence in content_list:
                    slen = len(sentence)
                    if 0 == slen:
                        continue
                    if 5 != slen and 7 != slen:
                        flag = False
                        break
                if flag:
                    # Each poem starts with '[' and ends with ']'.
                    poetrys.append('[' + content + ']')
        return poetrys

    def gen_poetry_vectors(self):
        """Build id<->character maps and vectorize every poem.

        Returns (poetry_vectors, word_to_id, id_to_word). The vocabulary also
        contains the space character, used later as batch padding.
        """
        words = sorted(set(''.join(self.poetrys) + ' '))
        # Numeric id -> character.
        id_to_word = {i: word for i, word in enumerate(words)}
        # Character -> numeric id.
        word_to_id = {v: k for k, v in id_to_word.items()}
        to_id = lambda word: word_to_id.get(word)
        # Vectorize each poem as a list of character ids.
        poetry_vectors = [list(map(to_id, poetry)) for poetry in self.poetrys]
        return poetry_vectors, word_to_id, id_to_word

    def next_batch(self, batch_size):
        """Return (x_batch, y_batch): space-padded id matrices where y is x
        shifted left by one position (next-character prediction targets)."""
        assert batch_size < self.poetry_vectors_size
        start = self._index_in_epoch
        self._index_in_epoch += batch_size
        # Epoch exhausted: reshuffle the poems and restart from the beginning.
        if self._index_in_epoch > self.poetry_vectors_size:
            np.random.shuffle(self.poetry_vectors)
            start = 0
            self._index_in_epoch = batch_size
        end = self._index_in_epoch
        batches = self.poetry_vectors[start:end]
        # Pad every row with the space id up to the longest poem in the batch.
        x_batch = np.full((batch_size, max(map(len, batches))), self.word_to_id[' '], np.int32)
        for row in range(batch_size):
            x_batch[row, :len(batches[row])] = batches[row]
        y_batch = np.copy(x_batch)
        y_batch[:, :-1] = x_batch[:, 1:]
        y_batch[:, -1] = x_batch[:, 0]
        return x_batch, y_batch
poetry_model.py
#-*- coding:utf-8 -*-
import tensorflow as tf
class poetryModel:
    """Character-level LSTM language model, TF1 graph style."""

    # Output-projection weight and bias (rnn_size -> vocabulary).
    def rnn_variable(self, rnn_size, words_size):
        with tf.variable_scope('variable'):
            w = tf.get_variable("w", [rnn_size, words_size])
            b = tf.get_variable("b", [words_size])
        return w, b

    # Average per-character cross-entropy over the whole batch.
    def loss_model(self, words_size, targets, logits):
        targets = tf.reshape(targets, [-1])
        loss = tf.contrib.legacy_seq2seq.sequence_loss_by_example(
            [logits], [targets],
            [tf.ones_like(targets, dtype=tf.float32)], words_size)
        loss = tf.reduce_mean(loss)
        return loss

    # Adam with global-norm gradient clipping (max norm 5).
    def optimizer_model(self, loss, learning_rate):
        tvars = tf.trainable_variables()
        grads, _ = tf.clip_by_global_norm(tf.gradients(loss, tvars), 5)
        train_op = tf.train.AdamOptimizer(learning_rate)
        optimizer = train_op.apply_gradients(zip(grads, tvars))
        return optimizer

    # Look up the embedding vector of every input character id.
    def embedding_variable(self, inputs, rnn_size, words_size):
        with tf.variable_scope('embedding'):
            with tf.device("/cpu:0"):
                embedding = tf.get_variable('embedding', [words_size, rnn_size])
                input_data = tf.nn.embedding_lookup(embedding, inputs)
        return input_data

    def create_model(self, inputs, batch_size, rnn_size, words_size, num_layers, is_training, keep_prob):
        """Build the multi-layer LSTM graph.

        Returns (logits, probs, initial_state, last_state); dropout is applied
        only when is_training is True.
        """
        # One distinct cell object per layer. The original passed
        # [lstm] * num_layers, which reuses a single cell instance across
        # layers (weight sharing, and a ValueError on newer TF 1.x).
        def make_cell():
            cell = tf.contrib.rnn.BasicLSTMCell(num_units=rnn_size, state_is_tuple=True)
            if is_training:
                cell = tf.nn.rnn_cell.DropoutWrapper(cell, output_keep_prob=keep_prob)
            return cell

        input_data = self.embedding_variable(inputs, rnn_size, words_size)
        if is_training:
            input_data = tf.nn.dropout(input_data, keep_prob)
        cell = tf.contrib.rnn.MultiRNNCell([make_cell() for _ in range(num_layers)],
                                           state_is_tuple=True)
        initial_state = cell.zero_state(batch_size, tf.float32)
        outputs, last_state = tf.nn.dynamic_rnn(cell, input_data, initial_state=initial_state)
        # Flatten time dimension so a single matmul projects every step.
        outputs = tf.reshape(outputs, [-1, rnn_size])
        w, b = self.rnn_variable(rnn_size, words_size)
        logits = tf.matmul(outputs, w) + b
        probs = tf.nn.softmax(logits)
        return logits, probs, initial_state, last_state
train_poetry.py
#-*- coding:utf-8 -*-
from generate_poetry import Poetry
from poetry_model import poetryModel
import tensorflow as tf
import numpy as np
if __name__ == '__main__':
    # Hyper-parameters for the character-level model.
    batch_size = 50
    epoch = 20  # NOTE(review): defined but unused — the loop below stops on `step`, not epochs
    rnn_size = 128
    num_layers = 2
    poetrys = Poetry()
    words_size = len(poetrys.word_to_id)
    # Variable-length sequences: (batch_size, seq_len) id matrices.
    inputs = tf.placeholder(tf.int32, [batch_size, None])
    targets = tf.placeholder(tf.int32, [batch_size, None])
    keep_prob = tf.placeholder(tf.float32, name='keep_prob')
    model = poetryModel()
    # is_training=True enables dropout inside the model.
    logits,probs,initial_state,last_state = model.create_model(inputs,batch_size,
                                                               rnn_size,words_size,num_layers,True,keep_prob)
    loss = model.loss_model(words_size,targets,logits)
    # Non-trainable variable so the rate can be re-assigned during training.
    learning_rate = tf.Variable(0.0, trainable=False)
    optimizer = model.optimizer_model(loss,learning_rate)
    saver = tf.train.Saver()
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        # Initial learning rate: 0.002 decayed once by 0.97.
        sess.run(tf.assign(learning_rate, 0.002 * 0.97 ))
        next_state = sess.run(initial_state)
        step = 0
        while True:
            x_batch,y_batch = poetrys.next_batch(batch_size)
            # Carry the LSTM state over from the previous batch.
            feed = {inputs:x_batch,targets:y_batch,initial_state:next_state,keep_prob:0.5}
            train_loss, _ ,next_state = sess.run([loss,optimizer,last_state], feed_dict=feed)
            print("step:%d loss:%f" % (step,train_loss))
            if step > 40000:
                break
            # Exponential learning-rate decay every 1000 steps: 0.002 * 0.97^n.
            if step%1000 == 0:
                n = step/1000
                sess.run(tf.assign(learning_rate, 0.002 * (0.97 ** n)))
            step += 1
        # Persist the trained weights once the loop finishes.
        saver.save(sess,"poetry_model.ckpt")
predicty_poetry.py
#-*- coding:utf-8 -*-
from generate_poetry import Poetry
from poetry_model import poetryModel
from operator import itemgetter
import tensorflow as tf
import numpy as np
import random
if __name__ == '__main__':
    # Generation runs one character at a time, so batch size is 1.
    batch_size = 1
    rnn_size = 128
    num_layers = 2
    poetrys = Poetry()
    words_size = len(poetrys.word_to_id)
    def to_word(prob):
        # prob: softmax distribution over the vocabulary, shape (1, words_size).
        prob = prob[0]
        # Vocabulary ids sorted by ascending probability.
        indexs, _ = zip(*sorted(enumerate(prob), key=itemgetter(1)))
        rand_num = int(np.random.rand(1)*10);
        index_sum = len(indexs)
        max_rate = prob[indexs[(index_sum-1)]]
        # Take the argmax when the model is very confident; otherwise pick
        # randomly among the ten most probable characters.
        if max_rate > 0.9 :
            sample = indexs[(index_sum-1)]
        else:
            sample = indexs[(index_sum-1-rand_num)]
        return poetrys.id_to_word[sample]
    inputs = tf.placeholder(tf.int32, [batch_size, None])
    keep_prob = tf.placeholder(tf.float32, name='keep_prob')
    model = poetryModel()
    # is_training=False: dropout disabled at generation time.
    logits,probs,initial_state,last_state = model.create_model(inputs,batch_size,
                                                               rnn_size,words_size,num_layers,False,keep_prob)
    saver = tf.train.Saver()
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        # Load the weights produced by train_poetry.py.
        saver.restore(sess,"poetry_model.ckpt")
        next_state = sess.run(initial_state)
        # Seed generation with the start-of-poem marker '['.
        x = np.zeros((1, 1))
        x[0,0] = poetrys.word_to_id['[']
        feed = {inputs: x, initial_state: next_state, keep_prob: 1}
        predict, next_state = sess.run([probs, last_state], feed_dict=feed)
        word = to_word(predict)
        poem = ''
        # Feed each generated character back in until ']' ends the poem.
        while word != ']':
            poem += word
            x = np.zeros((1, 1))
            x[0, 0] = poetrys.word_to_id[word]
            feed = {inputs: x, initial_state: next_state, keep_prob: 1}
            predict, next_state = sess.run([probs, last_state], feed_dict=feed)
            word = to_word(predict)
        print(poem)
最終的效果:
龍門不可見山人,日夕無情有故人。莫向西南不曾見,更應(yīng)春雨在山風(fēng)。
白雪新風(fēng)月未同,山花一月一人春。風(fēng)流白日春秋月,月色青松白玉衣。
實(shí)驗(yàn)原始鏈接:https://cloud.tencent.com/developer/labs/lab/10295
本地訓(xùn)練代碼和數(shù)據(jù):https://iss.igosh.com/share/201903/rrn_poem-me.tar.gz