Data + code: TensorFlow implementation
Tutorial video
A public parallel corpus
The following code should be run in Jupyter, with Python 3.
###1. Modules the model depends on
import tensorflow as tf
import numpy as np
from sklearn.model_selection import train_test_split
import time
import matplotlib.pyplot as plt
import pickle
###2. Load the data (English-to-German translation)
X, Y, en_word2idx, en_idx2word, en_vocab, de_word2idx, de_idx2word, de_vocab = pickle.load(open("data.pkl", 'rb'), encoding='utf-8')
###3. Check the format of the data
print('Sentence in English - encoded:', X[0])
print('Sentence in German - encoded:', Y[0])
print('Decoded:\n------------------------')
print('English sentence:',end=' ')
for i in range(len(X[1])):
print(en_idx2word[X[1][i]],end=' ')
print('\nGerman sentence:',end=' ')
for i in range(len(Y[1])):
print(de_idx2word[Y[1][i]],end=' ')
###4. Pad X and Y to fixed lengths (15 for English; 17 for German, including <go> and <eos>), then split into training and test sets
def data_padding(x, y, length = 15):
for i in range(len(x)):
x[i] = x[i] + (length - len(x[i])) * [en_word2idx['<pad>']]
y[i] = [de_word2idx['<go>']] + y[i] + [de_word2idx['<eos>']] + (length-len(y[i])) * [de_word2idx['<pad>']]
data_padding(X, Y)
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.1)
del X
del Y
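# Optional sanity check (uses only variables defined above): decode one padded pair
# back into tokens -- the English side should be exactly 15 tokens ending in <pad>,
# the German side 17 tokens starting with <go> and containing <eos>.
print(len(X_train[0]), len(Y_train[0]))  # expected: 15 17
print(' '.join(en_idx2word[idx] for idx in X_train[0]))
print(' '.join(de_idx2word[idx] for idx in Y_train[0]))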
###5.搭建翻譯模型
input_seq_len = 15
output_seq_len = 17
en_vocab_size = len(en_vocab) + 2 # + <pad>, <ukn>
de_vocab_size = len(de_vocab) + 4 # + <pad>, <ukn>, <eos>, <go>
# placeholders: len(encoder_inputs) = 15, len(decoder_inputs) = 17, len(targets) = 17, len(target_weights) = 17
encoder_inputs = [tf.placeholder(dtype = tf.int32, shape = [None], name = 'encoder{}'.format(i)) for i in range(input_seq_len)]
decoder_inputs = [tf.placeholder(dtype = tf.int32, shape = [None], name = 'decoder{}'.format(i)) for i in range(output_seq_len)]
targets = [decoder_inputs[i+1] for i in range(output_seq_len-1)]
targets.append(tf.placeholder(dtype = tf.int32, shape = [None], name = 'last_target'))
target_weights = [tf.placeholder(dtype = tf.float32, shape = [None], name = 'target_w{}'.format(i)) for i in range(output_seq_len)]
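# The shift above is standard teacher forcing: at step t the decoder is fed
# decoder_inputs[t] and is trained to predict decoder_inputs[t+1].
# A toy illustration (hypothetical tokens):
#   decoder_inputs: <go>  ich  bin  gut  <eos> <pad> ... <pad>        (17 slots)
#   targets:        ich   bin  gut  <eos> <pad> ...     last_target   (17 slots)
# The last slot has no "next token"; feed_dict() below fills it with <pad>
# and gives it weight 0, so it never contributes to the loss.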
# output projection: maps decoder hidden states to German vocabulary logits
size = 512 # LSTM hidden-state size (also the input dimension of the output projection)
w_t = tf.get_variable('proj_w', [de_vocab_size, size], tf.float32) # projection weights, shape [de_vocab_size, size]
b = tf.get_variable('proj_b', [de_vocab_size], tf.float32) # projection bias, shape [de_vocab_size]
w = tf.transpose(w_t)
output_projection = (w, b)
# call TensorFlow's embedding_attention_seq2seq directly
outputs, states = tf.contrib.legacy_seq2seq.embedding_attention_seq2seq(
encoder_inputs,
decoder_inputs,
tf.contrib.rnn.BasicLSTMCell(size),
num_encoder_symbols = en_vocab_size,
num_decoder_symbols = de_vocab_size,
embedding_size = 100,
feed_previous = False,
output_projection = output_projection,
dtype = tf.float32)
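# Note: because output_projection is supplied, the returned outputs are not vocabulary logits.
#   outputs: list of 17 tensors, each of shape [batch_size, 512] (pre-projection decoder states);
#            they are projected to de_vocab_size logits inside the sampled-softmax loss during
#            training, and explicitly via outputs_proj (section 8) for evaluation
#   states:  the decoder cell state at the final time step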
###6. Define the model's loss function
# sampled softmax loss - returns: A batch_size 1-D tensor of per-example sampled softmax losses
def sampled_loss(labels, logits):
return tf.nn.sampled_softmax_loss(
weights = w_t,
biases = b,
labels = tf.reshape(labels, [-1, 1]),
inputs = logits,
num_sampled = 512,
num_classes = de_vocab_size)
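# For intuition, toy shapes only (batch_size = 64): this is why the un-transposed w_t is
# passed, and why the loss is cheap -- only num_sampled = 512 classes are scored per example
# instead of the full German vocabulary.
#   inputs (un-projected decoder output per step): [64, 512]
#   labels (after the reshape):                    [64, 1]
#   weights w_t:                                   [de_vocab_size, 512]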
# cross-entropy loss between the predicted sequence and the target sequence
loss = tf.contrib.legacy_seq2seq.sequence_loss(outputs, targets, target_weights, softmax_loss_function = sampled_loss)
###7. Helper functions
# softmax over a 1-D numpy array (stabilized by subtracting the max)
def softmax(x):
n = np.max(x)
e_x = np.exp(x - n)
return e_x / e_x.sum()
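# quick usage example of the helper:
print(softmax(np.array([1.0, 2.0, 3.0])))  # roughly [0.090, 0.245, 0.665]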
# build a feed dict from a random mini-batch
def feed_dict(x, y, batch_size = 64):
feed = {}
idxes = np.random.choice(len(x), size = batch_size, replace = False)
for i in range(input_seq_len):
feed[encoder_inputs[i].name] = np.array([x[j][i] for j in idxes], dtype = np.int32)
for i in range(output_seq_len):
feed[decoder_inputs[i].name] = np.array([y[j][i] for j in idxes], dtype = np.int32)
feed[targets[len(targets)-1].name] = np.full(shape = [batch_size], fill_value = de_word2idx['<pad>'], dtype = np.int32)
for i in range(output_seq_len-1):
batch_weights = np.ones(batch_size, dtype = np.float32)
target = feed[decoder_inputs[i+1].name]
for j in range(batch_size):
if target[j] == de_word2idx['<pad>']:
batch_weights[j] = 0.0
feed[target_weights[i].name] = batch_weights
feed[target_weights[output_seq_len-1].name] = np.zeros(batch_size, dtype = np.float32)
return feed
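# The target_weights built here act as a padding mask: a position whose target token is
# <pad> gets weight 0, so sequence_loss averages only over real tokens.
# For one hypothetical sentence:
#   targets:        ich  bin  gut  <eos> <pad> ... <pad>
#   target_weights: 1.0  1.0  1.0  1.0   0.0  ...  0.0   (the very last weight is always 0)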
# decode a sequence of decoder outputs into German words
def decode_output(output_seq):
words = []
for i in range(output_seq_len):
smax = softmax(output_seq[i])
idx = np.argmax(smax)
words.append(de_idx2word[idx])
return words
# ops and hyperparameters
learning_rate = 5e-3
batch_size = 64
steps = 10 # note: originally 1000; set to 10 here only to verify quickly that the model runs
# (with only 10 steps the checkpoint branch below, which saves every 20 steps, never fires;
#  raise steps and create the checkpoints/ directory before running the test section)
###8. Model ops
# ops for projecting outputs
outputs_proj = [tf.matmul(outputs[i], output_projection[0]) + output_projection[1] for i in range(output_seq_len)]
# training op
optimizer = tf.train.RMSPropOptimizer(learning_rate).minimize(loss)
# init op
init = tf.global_variables_initializer()
# forward step
def forward_step(sess, feed):
output_sequences = sess.run(outputs_proj, feed_dict = feed)
return output_sequences
# training step
def backward_step(sess, feed):
sess.run(optimizer, feed_dict = feed)
###9. Saving and training the model
losses = []
saver = tf.train.Saver() # for saving checkpoints
print('------------------TRAINING------------------')
with tf.Session() as sess:
sess.run(init)
t = time.time()
for step in range(steps):
feed = feed_dict(X_train, Y_train)
backward_step(sess, feed)
if step % 5 == 4 or step == 0:
loss_value = sess.run(loss, feed_dict = feed)
print('step: {}, loss: {}'.format(step, loss_value))
losses.append(loss_value)
if step % 20 == 19:
saver.save(sess, 'checkpoints/', global_step=step)
print('Checkpoint is saved')
print('Training time for {} steps: {}s'.format(steps, time.time() - t))
###10. Plot the training loss
with plt.style.context('fivethirtyeight'):
plt.plot(losses, linewidth = 1)
plt.xlabel('Steps')
plt.ylabel('Losses')
plt.ylim((0, 12))
plt.show()
###11. Test the trained model
with tf.Graph().as_default():
# placeholders
encoder_inputs = [tf.placeholder(dtype = tf.int32, shape = [None], name = 'encoder{}'.format(i)) for i in range(input_seq_len)]
decoder_inputs = [tf.placeholder(dtype = tf.int32, shape = [None], name = 'decoder{}'.format(i)) for i in range(output_seq_len)]
# output projection
size = 512
w_t = tf.get_variable('proj_w', [de_vocab_size, size], tf.float32)
b = tf.get_variable('proj_b', [de_vocab_size], tf.float32)
w = tf.transpose(w_t)
output_projection = (w, b)
# change the model so that output at time t can be fed as input at time t+1
outputs, states = tf.contrib.legacy_seq2seq.embedding_attention_seq2seq(
encoder_inputs,
decoder_inputs,
tf.contrib.rnn.BasicLSTMCell(size),
num_encoder_symbols = en_vocab_size,
num_decoder_symbols = de_vocab_size,
embedding_size = 100,
feed_previous = True, # <-----this is changed----->
output_projection = output_projection,
dtype = tf.float32)
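# With feed_previous = True the decoder feeds its own greedy prediction from step t back in
# as the input for step t+1, so only decoder_inputs[0] actually participates in the graph;
# that is why the feed dict below supplies just the <go> symbol for the decoder. Schematically:
#   input_0     = <go>
#   input_{t+1} = argmax( outputs[t] @ w + b )   # greedy decoding via output_projection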
# ops for projecting outputs
outputs_proj = [tf.matmul(outputs[i], output_projection[0]) + output_projection[1] for i in range(output_seq_len)]
# let's translate these sentences
en_sentences = ["What' s your name", 'My name is', 'What are you doing', 'I am reading a book',\
'How are you', 'I am good', 'Do you speak English', 'What time is it', 'Hi', 'Goodbye', 'Yes', 'No']
en_sentences_encoded = [[en_word2idx.get(word, 0) for word in en_sentence.split()] for en_sentence in en_sentences]
# padding to fit encoder input
for i in range(len(en_sentences_encoded)):
en_sentences_encoded[i] += (15 - len(en_sentences_encoded[i])) * [en_word2idx['<pad>']]
# restore all variables - use the last checkpoint saved
saver = tf.train.Saver()
path = tf.train.latest_checkpoint('checkpoints')
with tf.Session() as sess:
# restore
saver.restore(sess, path)
# feed data into placeholders
feed = {}
for i in range(input_seq_len):
feed[encoder_inputs[i].name] = np.array([en_sentences_encoded[j][i] for j in range(len(en_sentences_encoded))], dtype = np.int32)
feed[decoder_inputs[0].name] = np.array([de_word2idx['<go>']] * len(en_sentences_encoded), dtype = np.int32)
# translate
output_sequences = sess.run(outputs_proj, feed_dict = feed)
# decode seq.
for i in range(len(en_sentences_encoded)):
print('{}.\n--------------------------------'.format(i+1))
output_seq = [output_sequences[j][i] for j in range(output_seq_len)]
#decode output sequence
words = decode_output(output_seq)
print(en_sentences[i])
for i in range(len(words)):
if words[i] not in ['<eos>', '<pad>', '<go>']:
print(words[i],end=' ')
print('\n--------------------------------')
Code for the data-processing part:
Format of data.en, one sentence per line:
I was a Ph.D. student in clinical psychology at Berkeley.
She was a 26-year-old woman named Alex.
Now Alex walked into her first session wearing jeans and a big slouchy top, and she dropped onto the couch in my office and kicked off her flats and told me she was there to talk about guy problems.
Now when I heard this, I was so relieved.
My classmate got an arsonist for her first client.
And I got a twentysomething who wanted to talk about boys.
This I thought I could handle.
But I didn't handle it.
With the funny stories that Alex would bring to session, it was easy for me just to nod my head while we kicked the can down the road.
Format of data.de, one sentence per line:
Als ich in meinen 20ern war, hatte ich meine erste Psychotherapie-Patientin.
Ich war Doktorandin und studierte Klinische Psychologie in Berkeley.
Sie war eine 26-jährige Frau namens Alex.
Als Alex in die erste Sitzung kam, trug sie Jeans und ein ausgebeultes Top. Sie fiel auf das Sofa in meinem Büro, schleuderte ihre Sandalen von sich und erzählte mir, sie wäre da, um über Männerprobleme zu reden.
Und als ich das hörte, war ich erleichtert.
Meine Kommilitonin bekam nämlich einen Brandstifter als ersten Patienten.
Und ich bekam eine Frau in den 20ern, die über Jungs reden wollte.
Das kriege ich hin, dachte ich mir.
Aber ich habe es nicht hingekriegt.
Mit den lustigen Geschichten, die Alex mit in die Sitzung brachte, war es leicht für mich, einfach mit dem Kopf zu nicken, während wir die Probleme vor uns herschoben.
The following code preprocesses the parallel corpus above; run it in Jupyter, with Python 3.
import pickle
from collections import Counter
from operator import itemgetter
def read_sentences(file_path):
sentences = []
with open(file_path, 'r', encoding='utf-8') as reader:
for s in reader:
sentences.append(s.strip())
return sentences
def create_dataset(en_sentences, de_sentences):
en_vocab_dict = Counter(word.strip(',." ;:)(][?!') for sentence in en_sentences for word in sentence.split())
de_vocab_dict = Counter(word.strip(',." ;:)(][?!') for sentence in de_sentences for word in sentence.split())
en_vocab = list(map(lambda x: x[0], sorted(en_vocab_dict.items(), key = lambda x: -x[1])))
de_vocab = list(map(lambda x: x[0], sorted(de_vocab_dict.items(), key = lambda x: -x[1])))
# en_vocab = en_vocab[:20000]
# de_vocab = de_vocab[:30000]
start_idx = 2
en_word2idx = dict([(word, idx+start_idx) for idx, word in enumerate(en_vocab)])
en_word2idx['<ukn>'] = 0
en_word2idx['<pad>'] = 1
en_idx2word = dict([(idx, word) for word, idx in en_word2idx.items()])
start_idx = 4
de_word2idx = dict([(word, idx+start_idx) for idx, word in enumerate(de_vocab)])
de_word2idx['<ukn>'] = 0
de_word2idx['<go>'] = 1
de_word2idx['<eos>'] = 2
de_word2idx['<pad>'] = 3
de_idx2word = dict([(idx, word) for word, idx in de_word2idx.items()])
x = [[en_word2idx.get(word.strip(',." ;:)(][?!'), 0) for word in sentence.split()] for sentence in en_sentences]
y = [[de_word2idx.get(word.strip(',." ;:)(][?!'), 0) for word in sentence.split()] for sentence in de_sentences]
X = []
Y = []
for i in range(len(x)):
n1 = len(x[i])
n2 = len(y[i])
n = n1 if n1 < n2 else n2
if abs(n1 - n2) <= 0.3 * n:
if n1 <= 15 and n2 <= 15:
X.append(x[i])
Y.append(y[i])
return X, Y, en_word2idx, en_idx2word, en_vocab, de_word2idx, de_idx2word, de_vocab
def save_dataset(file_path, obj):
with open(file_path, 'wb') as f:
pickle.dump(obj, f, -1)
def read_dataset(file_path):
with open(file_path, 'rb') as f:
return pickle.load(f)
en_sentences = read_sentences('data.en')
de_sentences = read_sentences('data.de')
save_dataset('demo_data.pkl', create_dataset(en_sentences, de_sentences))
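# A minimal sanity check after running the script above (counts depend on your corpus).
# Note: the training notebook earlier reads the pickle as "data.pkl", while this script
# saves "demo_data.pkl"; rename the file or adjust one of the paths so they agree.
X, Y, en_word2idx, en_idx2word, en_vocab, de_word2idx, de_idx2word, de_vocab = read_dataset('demo_data.pkl')
print('sentence pairs kept:', len(X))
print('English vocab size :', len(en_word2idx)) # len(en_vocab) + 2 special tokens
print('German vocab size  :', len(de_word2idx)) # len(de_vocab) + 4 special tokens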