
數據+代碼 TensorFlow實現


import tensorflow as tf
import numpy as np
from sklearn.model_selection import train_test_split
import time
import matplotlib.pyplot as plt
import pickle

# Load the preprocessed dataset: encoded sentence pairs plus the vocabularies
# and index<->word maps for both languages.
# NOTE(review): pickle executes arbitrary code on load; only use a data.pkl
# that comes from a trusted source.
with open("data.pkl", 'rb') as data_file:  # closed deterministically, unlike a bare open()
    (X, Y, en_word2idx, en_idx2word, en_vocab,
     de_word2idx, de_idx2word, de_vocab) = pickle.load(data_file, encoding='utf-8')

# Show the first sentence pair in encoded (index) form.
print('Sentence in English - encoded:', X[0])
print('Sentence in German - encoded:', Y[0])

# Show the second sentence pair decoded back into words.
# NOTE(review): the two non-English labels below look garbled by text
# extraction; they are runtime strings and are left unchanged - confirm the
# intended text.
print('英語(yǔ)句子:',end=' ')
for word_idx in X[1]:
    print(en_idx2word[word_idx], end=' ')
print('\n德語(yǔ)句子:',end=' ')
for word_idx in Y[1]:
    print(de_idx2word[word_idx], end=' ')

def data_padding(x, y, length = 15):
    """Pad the encoded sentence pairs in place.

    Every entry of ``x`` is right-padded with the English ``<pad>`` index up
    to ``length`` items. Every entry of ``y`` is wrapped as
    ``<go> + sentence + <eos>`` and then right-padded with the German
    ``<pad>`` index, using ``length - len(sentence)`` pad items.

    Both lists are modified in place; nothing is returned. Entries already
    longer than ``length`` receive no padding (a negative repeat count yields
    an empty list).
    """
    for row, source in enumerate(x):
        source_fill = [en_word2idx['<pad>']] * (length - len(source))
        x[row] = source + source_fill
        target = y[row]
        target_fill = [de_word2idx['<pad>']] * (length - len(target))
        y[row] = [de_word2idx['<go>']] + target + [de_word2idx['<eos>']] + target_fill

# Sequence lengths are defined once and reused for padding and for the number
# of placeholders, instead of repeating the literals 15 and 17.
input_seq_len = 15
# data_padding adds <go> and <eos> around each target sentence, so decoder
# sequences are two steps longer than encoder sequences.
output_seq_len = input_seq_len + 2

# Pad all sentence pairs in place, then hold out 10% of them for testing.
data_padding(X, Y, input_seq_len)
X_train,  X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.1)

# The full lists are no longer needed once the split exists.
del X
del Y

en_vocab_size = len(en_vocab) + 2 # + <pad>, <ukn>
de_vocab_size = len(de_vocab) + 4 # + <pad>, <ukn>, <eos>, <go>

# Placeholders, one per time step:
# len(encoder_inputs) == input_seq_len, and
# len(decoder_inputs) == len(targets) == len(target_weights) == output_seq_len.
encoder_inputs = [
    tf.placeholder(tf.int32, shape=[None], name='encoder{}'.format(t))
    for t in range(input_seq_len)
]
decoder_inputs = [
    tf.placeholder(tf.int32, shape=[None], name='decoder{}'.format(t))
    for t in range(output_seq_len)
]
# Targets are the decoder inputs shifted left by one step; the final step has
# no "next" decoder input, so it gets a placeholder of its own.
targets = decoder_inputs[1:] + [
    tf.placeholder(tf.int32, shape=[None], name='last_target')
]
target_weights = [
    tf.placeholder(tf.float32, shape=[None], name='target_w{}'.format(t))
    for t in range(output_seq_len)
]

# Output projection onto the German vocabulary.
size = 512  # second dimension of proj_w; presumably also the RNN cell width - confirm
w_t = tf.get_variable('proj_w', shape=[de_vocab_size, size], dtype=tf.float32)  # projection weights
b = tf.get_variable('proj_b', shape=[de_vocab_size], dtype=tf.float32)  # projection bias
w = tf.transpose(w_t)
output_projection = (w, b)

# Training model. embedding_attention_seq2seq takes the encoder inputs, the
# decoder inputs and an RNN cell as its first three positional arguments; the
# call previously omitted all three. The cell has `size` units so that its
# outputs can be multiplied by the (size x de_vocab_size) output projection.
# feed_previous=False: the decoder reads the provided decoder inputs at every step.
outputs, states = tf.contrib.legacy_seq2seq.embedding_attention_seq2seq(
                                            encoder_inputs,
                                            decoder_inputs,
                                            tf.contrib.rnn.BasicLSTMCell(size),
                                            num_encoder_symbols = en_vocab_size,
                                            num_decoder_symbols = de_vocab_size,
                                            embedding_size = 100,
                                            feed_previous = False,
                                            output_projection = output_projection,
                                            dtype = tf.float32)

# Sampled-softmax loss, passed to sequence_loss as the softmax loss function.
def sampled_loss(labels, logits):
    """Return the per-example sampled softmax loss.

    ``labels`` is reshaped into a single column and ``logits`` is passed as
    the ``inputs`` argument of ``tf.nn.sampled_softmax_loss``, together with
    the projection weights ``w_t`` and bias ``b``. 512 classes are sampled
    out of ``de_vocab_size``.

    Returns a batch_size 1-D tensor of per-example sampled softmax losses.
    """
    column_labels = tf.reshape(labels, [-1, 1])
    return tf.nn.sampled_softmax_loss(
        weights=w_t,
        biases=b,
        labels=column_labels,
        inputs=logits,
        num_sampled=512,
        num_classes=de_vocab_size,
    )

# Sequence loss between the model outputs and the target sequence, weighted
# per step by target_weights and using sampled_loss as the softmax loss function.
loss = tf.contrib.legacy_seq2seq.sequence_loss(outputs, targets, target_weights, softmax_loss_function = sampled_loss)

# Hand-written softmax.
def softmax(x):
    """Return the softmax of ``x``, normalized over all of its elements.

    The global maximum is subtracted before exponentiation so that the
    largest exponent is zero.
    """
    exponentials = np.exp(x - np.max(x))
    total = exponentials.sum()
    return exponentials / total

# Build the placeholder feed for one randomly sampled batch.
def feed_dict(x, y, batch_size = 64):
    """Return a feed mapping placeholder names to arrays for a random batch.

    ``batch_size`` distinct rows are drawn from ``x`` / ``y`` without
    replacement. For each time step the encoder and decoder placeholders
    receive the corresponding column of the sampled rows. The extra last
    target is filled with the German ``<pad>`` index. The weight of a target
    step is 0.0 where the target (the next decoder input) is ``<pad>`` and
    1.0 otherwise; the final step always has weight 0.0.
    """
    feed = {}
    rows = np.random.choice(len(x), size = batch_size, replace = False)

    for t in range(input_seq_len):
        encoder_column = [x[r][t] for r in rows]
        feed[encoder_inputs[t].name] = np.array(encoder_column, dtype = np.int32)

    for t in range(output_seq_len):
        decoder_column = [y[r][t] for r in rows]
        feed[decoder_inputs[t].name] = np.array(decoder_column, dtype = np.int32)

    pad_id = de_word2idx['<pad>']
    feed[targets[-1].name] = np.full(shape = [batch_size], fill_value = pad_id, dtype = np.int32)

    for t in range(output_seq_len - 1):
        # The target at step t is the decoder input at step t + 1.
        next_tokens = feed[decoder_inputs[t + 1].name]
        feed[target_weights[t].name] = (next_tokens != pad_id).astype(np.float32)

    feed[target_weights[output_seq_len - 1].name] = np.zeros(batch_size, dtype = np.float32)
    return feed

# Decode one projected output sequence into German words.
def decode_output(output_seq):
    """Return the most likely German word for each output step.

    ``output_seq`` is indexed by time step for the first ``output_seq_len``
    steps. Each step's scores are passed through ``softmax`` and the index of
    the largest value is looked up in ``de_idx2word``.

    The selected word was previously never appended, so the function always
    returned an empty list.
    """
    words = []
    for i in range(output_seq_len):
        smax = softmax(output_seq[i])
        idx = np.argmax(smax)
        words.append(de_idx2word[idx])
    return words

# ops and hyperparameters
learning_rate = 5e-3
# NOTE(review): feed_dict's own batch_size default is also 64; confirm the
# training loop actually passes this value.
batch_size = 64
steps = 10 # NOTE: the original value was 1000; lowered to 10 to quickly check that the model runs

# ops for projecting outputs: apply the output projection (weights, bias) to every decoder output step
outputs_proj = [tf.matmul(outputs[i], output_projection[0]) + output_projection[1] for i in range(output_seq_len)]
# training op
optimizer = tf.train.RMSPropOptimizer(learning_rate).minimize(loss)
# init op (created after the optimizer, so it also covers any variables the optimizer added)
init = tf.global_variables_initializer()
# Forward pass.
def forward_step(sess, feed):
    """Evaluate ``outputs_proj`` in ``sess`` with ``feed`` and return the result."""
    return sess.run(outputs_proj, feed_dict = feed)
# One optimization step.
def backward_step(sess, feed):
    """Run the ``optimizer`` op once in ``sess`` with ``feed``; returns nothing."""
    sess.run(fetches = optimizer, feed_dict = feed)

losses = []  # loss values recorded during training, plotted below
saver = tf.train.Saver()  # saves model checkpoints

with tf.Session() as sess:
    # Variables must be initialized before the first training step; this call
    # was missing.
    sess.run(init)
    t = time.time()
    for step in range(steps):
        feed = feed_dict(X_train, Y_train, batch_size)
        backward_step(sess, feed)
        # Report the loss on the first step and then on every fifth step.
        if step % 5 == 4 or step == 0:
            loss_value = sess.run(loss, feed_dict = feed)
            print('step: {}, loss: {}'.format(step, loss_value))
            # Record the value so that the plot below has data.
            losses.append(loss_value)
        # Checkpoint every 20 steps and also on the final step; otherwise a
        # short run (steps < 20) writes no checkpoint and a later restore
        # finds nothing.
        # NOTE(review): the 'checkpoints' directory presumably has to exist
        # beforehand - confirm for the TensorFlow version in use.
        if step % 20 == 19 or step == steps - 1:
            saver.save(sess, 'checkpoints/', global_step=step)
            print('Checkpoint is saved')

    print('Training time for {} steps: {}s'.format(steps, time.time() - t))

# Plot the recorded training losses.
with plt.style.context('fivethirtyeight'):
    plt.plot(losses, linewidth = 1)
    plt.ylim((0, 12))

# Inference: rebuild the model in a fresh graph with feed_previous=True,
# restore the latest checkpoint and translate a few sample sentences.
with tf.Graph().as_default():
    # placeholders
    encoder_inputs = [tf.placeholder(dtype = tf.int32, shape = [None], name = 'encoder{}'.format(i)) for i in range(input_seq_len)]
    decoder_inputs = [tf.placeholder(dtype = tf.int32, shape = [None], name = 'decoder{}'.format(i)) for i in range(output_seq_len)]
    # output projection (same variable names and shapes as in the training graph)
    size = 512
    w_t = tf.get_variable('proj_w', [de_vocab_size, size], tf.float32)
    b = tf.get_variable('proj_b', [de_vocab_size], tf.float32)
    w = tf.transpose(w_t)
    output_projection = (w, b)
    # change the model so that output at time t can be fed as input at time t+1
    # The encoder inputs, decoder inputs and RNN cell are required positional
    # arguments; they were missing from this call.
    outputs, states = tf.contrib.legacy_seq2seq.embedding_attention_seq2seq(
                                                encoder_inputs,
                                                decoder_inputs,
                                                tf.contrib.rnn.BasicLSTMCell(size),
                                                num_encoder_symbols = en_vocab_size,
                                                num_decoder_symbols = de_vocab_size,
                                                embedding_size = 100,
                                                feed_previous = True, # <-----this is changed----->
                                                output_projection = output_projection,
                                                dtype = tf.float32)
    # ops for projecting outputs
    outputs_proj = [tf.matmul(outputs[i], output_projection[0]) + output_projection[1] for i in range(output_seq_len)]

    # let's translate these sentences
    en_sentences = ["What' s your name", 'My name is', 'What are you doing', 'I am reading a book',\
                    'How are you', 'I am good', 'Do you speak English', 'What time is it', 'Hi', 'Goodbye', 'Yes', 'No']
    # words missing from the vocabulary map to index 0
    en_sentences_encoded = [[en_word2idx.get(word, 0) for word in en_sentence.split()] for en_sentence in en_sentences]
    # padding to fit encoder input (input_seq_len instead of a hard-coded 15)
    for i in range(len(en_sentences_encoded)):
        en_sentences_encoded[i] += (input_seq_len - len(en_sentences_encoded[i])) * [en_word2idx['<pad>']]
    # restore all variables - use the last checkpoint saved
    saver = tf.train.Saver()
    path = tf.train.latest_checkpoint('checkpoints')
    with tf.Session() as sess:
        # restore
        saver.restore(sess, path)
        # feed data into placeholders
        feed = {}
        for i in range(input_seq_len):
            feed[encoder_inputs[i].name] = np.array([en_sentences_encoded[j][i] for j in range(len(en_sentences_encoded))], dtype = np.int32)
        # only the first decoder input (<go>) is fed here
        feed[decoder_inputs[0].name] = np.array([de_word2idx['<go>']] * len(en_sentences_encoded), dtype = np.int32)
        # translate
        output_sequences = sess.run(outputs_proj, feed_dict = feed)
        # decode seq.
        for i in range(len(en_sentences_encoded)):
            ouput_seq = [output_sequences[j][i] for j in range(output_seq_len)]
            #decode output sequence
            words = decode_output(ouput_seq)
            # Print the source sentence, then its translation on its own line.
            # The inner loop no longer reuses `i`, and each translation ends
            # with a newline instead of running into the next one.
            print(en_sentences[i])
            for word in words:
                if word not in ['<eos>', '<pad>', '<go>']:
                    print(word,end=' ')
            print()



When I was in my 20s, I saw my very first psychotherapy client. 
I was a Ph.D. student in clinical psychology at Berkeley. 
She was a 26-year-old woman named Alex. 
Now Alex walked into her first session wearing jeans and a big slouchy top, and she dropped onto the couch in my office and kicked off her flats and told me she was there to talk about guy problems. 
Now when I heard this, I was so relieved. 
My classmate got an arsonist for her first client. 
And I got a twentysomething who wanted to talk about boys. 
This I thought I could handle. 
But I didn't handle it. 
With the funny stories that Alex would bring to session, it was easy for me just to nod my head while we kicked the can down the road.


Als ich in meinen 20ern war, hatte ich meine erste Psychotherapie-Patientin. 
Ich war Doktorandin und studierte Klinische Psychologie in Berkeley. 
Sie war eine 26-jährige Frau namens Alex. 
Als Alex in die erste Sitzung kam, trug sie Jeans und ein ausgebeultes Top. Sie fiel auf das Sofa in meinem Büro, schleuderte ihre Sandalen von sich und erzählte mir, sie wäre da, um über Männerprobleme zu reden. 
Und als ich das hörte, war ich erleichtert. 
Meine Kommilitonin bekam nämlich einen Brandstifter als ersten Patienten. 
Und ich bekam eine Frau in den 20ern, die über Jungs reden wollte. 
Das kriege ich hin, dachte ich mir. 
Aber ich habe es nicht hingekriegt. 
Mit den lustigen Geschichten, die Alex mit in die Sitzung brachte, war es leicht für mich, einfach mit dem Kopf zu nicken, während wir die Probleme vor uns herschoben. 


import pickle
from collections import Counter
from operator import itemgetter

def read_sentences(file_path):
    """Read a UTF-8 text file and return its lines with surrounding whitespace stripped.

    One list entry is produced per line of the file. Blank lines are kept as
    empty strings so that line numbers stay aligned between the two files of
    a parallel corpus.

    The loop body was missing, which made the function a syntax error.
    """
    with open(file_path, 'r', encoding='utf-8') as reader:
        return [line.strip() for line in reader]

def create_dataset(en_sentences, de_sentences, max_len=15):
    """Build vocabularies and encoded sentence pairs from a parallel corpus.

    Sentences are split on whitespace and each token has surrounding
    punctuation stripped. Vocabularies are ordered by descending frequency,
    with ties kept in first-seen order.

    Index layout:
      English: ``<ukn>`` = 0, ``<pad>`` = 1, words from 2.
      German:  ``<ukn>`` = 0, ``<go>`` = 1, ``<eos>`` = 2, ``<pad>`` = 3,
               words from 4.

    A pair is kept only if both encoded sentences have at most ``max_len``
    tokens and their lengths differ by at most 30% of the shorter one.

    Args:
        en_sentences: list of English sentences.
        de_sentences: list of German sentences, aligned by position.
        max_len: maximum number of tokens allowed on either side (default 15,
            the value that used to be hard-coded).

    Returns:
        ``(X, Y, en_word2idx, en_idx2word, en_vocab,
        de_word2idx, de_idx2word, de_vocab)``.

    Raises:
        IndexError: if ``de_sentences`` is shorter than ``en_sentences``.
    """
    strip_chars = ',." ;:)(][?!'

    def tokenize(sentence):
        # Split on whitespace and strip punctuation around each token.
        return [word.strip(strip_chars) for word in sentence.split()]

    en_tokens = [tokenize(sentence) for sentence in en_sentences]
    de_tokens = [tokenize(sentence) for sentence in de_sentences]

    # most_common() sorts by descending count and is stable for ties.
    en_counts = Counter(word for tokens in en_tokens for word in tokens)
    de_counts = Counter(word for tokens in de_tokens for word in tokens)
    en_vocab = [word for word, _ in en_counts.most_common()]
    de_vocab = [word for word, _ in de_counts.most_common()]

    # Special tokens are assigned after the words, as before, so they win if
    # the same string also occurs in the corpus.
    en_word2idx = {word: idx for idx, word in enumerate(en_vocab, start=2)}
    en_word2idx['<ukn>'] = 0
    en_word2idx['<pad>'] = 1
    en_idx2word = {idx: word for word, idx in en_word2idx.items()}

    de_word2idx = {word: idx for idx, word in enumerate(de_vocab, start=4)}
    de_word2idx['<ukn>'] = 0
    de_word2idx['<go>'] = 1
    de_word2idx['<eos>'] = 2
    de_word2idx['<pad>'] = 3
    de_idx2word = {idx: word for word, idx in de_word2idx.items()}

    x = [[en_word2idx.get(word, 0) for word in tokens] for tokens in en_tokens]
    y = [[de_word2idx.get(word, 0) for word in tokens] for tokens in de_tokens]

    X = []
    Y = []
    for i, en_ids in enumerate(x):
        de_ids = y[i]
        n1 = len(en_ids)
        n2 = len(de_ids)
        similar_length = abs(n1 - n2) <= 0.3 * min(n1, n2)
        if similar_length and n1 <= max_len and n2 <= max_len:
            # These appends were missing, so X and Y were never filled.
            X.append(en_ids)
            Y.append(de_ids)

    return X, Y, en_word2idx, en_idx2word, en_vocab, de_word2idx, de_idx2word, de_vocab

def save_dataset(file_path, obj):
    """Pickle ``obj`` to ``file_path`` using the highest available protocol."""
    with open(file_path, 'wb') as sink:
        # Protocol -1 selects the highest protocol available.
        pickle.Pickler(sink, -1).dump(obj)

def read_dataset(file_path):
    """Load and return the object pickled in ``file_path``.

    NOTE(review): unpickling can execute arbitrary code; only load files from
    a trusted source.
    """
    with open(file_path, 'rb') as source:
        dataset = pickle.Unpickler(source).load()
    return dataset

# Read the two sides of the parallel corpus (English first, then German),
# build the dataset and pickle it to demo_data.pkl.
en_sentences, de_sentences = (read_sentences(corpus_path) for corpus_path in ('data.en', 'data.de'))
save_dataset(
    'demo_data.pkl',
    create_dataset(en_sentences, de_sentences),
)
