使用tensorflow進行單詞劃分類別,依賴于sklearn自帶的已經標注好的數據集(有監督學習)
參考https://www.oschina.net/translate/big-picture-machine-learning?lang=chs&page=1#
完整代碼https://github.com/pursedream/tensorflow_1
- 導入相關包
from collections import Counter
import numpy as np
from sklearn.datasets import fetch_20newsgroups
import tensorflow as tf
import pandas as pd
- 獲取訓練數據和測試數據
# Load three categories of the 20 Newsgroups data set through scikit-learn.
# This is supervised learning: every document already carries a class label.
# See http://scikit-learn.org/stable/datasets/twenty_newsgroups.html
categories = ["comp.graphics", "sci.space", "rec.sport.baseball"]
train_set = fetch_20newsgroups(subset='train', categories=categories)
test_set = fetch_20newsgroups(subset='test', categories=categories)
# print('total texts in train:',len(train_set.data))
# print('total texts in test:',len(test_set.data))

# Count every space-separated token of both splits, so that vocab[word] is
# the number of occurrences of `word`. Tokens are lower-cased for BOTH
# splits: the index built from this vocabulary and the batch encoder look
# words up in lower case, so counting the test split with its original case
# would only add case-only duplicates and widen the input layer for nothing.
vocab = Counter()
for split in (train_set, test_set):
    for text in split.data:
        vocab.update(word.lower() for word in text.split(' '))
print(len(vocab))
total_words = len(vocab)
def get_index(vocab):
    """Map every word of *vocab* to an integer position.

    Args:
        vocab: Iterable of word strings (for example a ``Counter`` keyed by
            word); only the iteration order of its elements is used.

    Returns:
        A dict from the lower-cased word to its position in the iteration
        order of *vocab*. When several entries differ only by case, the
        position of the last one wins.
    """
    return {element.lower(): i for i, element in enumerate(vocab)}
# Word -> column index lookup used when encoding documents; print the index
# assigned to 'the' as a quick sanity check.
text_index = get_index(vocab)
print("the is %s" % (text_index['the'],))
- 神經網絡的參數設定
# Number of units per layer: one input unit per vocabulary word, two hidden
# layers, and one output unit per category
# ("comp.graphics", "sci.space", "rec.sport.baseball").
n_input_number = total_words
n_hidden1 = 100
n_hiddent2 = 100
n_class = 3

# In neural-network terms, one epoch = one forward pass (computing the
# outputs) plus one backward pass (updating the weights) over all training
# examples.
learning_rate = 0.01
training_epochs = 10

# Mini-batch size and how often (in epochs) progress is reported.
batch_size = 150
display_step = 1

# A None entry in a shape marks a dimension of variable size. The model is
# tested with a larger batch than it is trained with, hence the variable
# batch dimension on both placeholders.
input_tensor = tf.placeholder(dtype=tf.float32, shape=[None, n_input_number], name='input')
output_tensor = tf.placeholder(dtype=tf.float32, shape=[None, n_class], name='output')
- 建立模型
# Forward computation of the network
def out_prediction(input_tensor, weights, biases):
    """Build the forward pass of a network with two ReLU hidden layers.

    Args:
        input_tensor: Input batch; it is matrix-multiplied by
            ``weights['h1']``.
        weights: Mapping with the keys ``'h1'``, ``'h2'`` and ``'out'``.
        biases: Mapping with the keys ``'b1'``, ``'b2'`` and ``'out'``.

    Returns:
        The output-layer result ``matmul(hidden, weights['out']) +
        biases['out']``. No activation is applied to it here.
    """
    hidden = input_tensor
    # Both hidden layers are the same three steps: matmul, add bias, ReLU.
    for weight_key, bias_key in (('h1', 'b1'), ('h2', 'b2')):
        multiplied = tf.matmul(hidden, weights[weight_key])
        shifted = tf.add(multiplied, biases[bias_key])
        hidden = tf.nn.relu(shifted)
    return tf.matmul(hidden, weights['out']) + biases['out']
# Meaning of a shape argument: [] is a single number, [3] is a vector of
# length 3, and [2, 3] is a matrix (a tensor; the same linear map has
# different representations in different bases).
# https://www.zhihu.com/question/20695804
# Weights and biases start from values drawn from a normal distribution.
weights = dict(
    h1=tf.Variable(tf.random_normal([n_input_number, n_hidden1])),
    h2=tf.Variable(tf.random_normal([n_hidden1, n_hiddent2])),
    out=tf.Variable(tf.random_normal([n_hiddent2, n_class])),
)
biases = dict(
    b1=tf.Variable(tf.random_normal([n_hidden1])),
    b2=tf.Variable(tf.random_normal([n_hiddent2])),
    out=tf.Variable(tf.random_normal([n_class])),
)
prediction = out_prediction(input_tensor, weights, biases)
- 由于是分類問題,使用交叉熵誤差進行權值更新
# `prediction` is already built right after the weights and biases are
# created; calling out_prediction a second time here would only add a
# duplicate copy of the same ops to the graph, so it is reused as is.
# This is a classification problem, so the optimisation target is the softmax
# cross-entropy between the network output and the labels fed through
# output_tensor, averaged over the batch.
cross_loss = tf.nn.softmax_cross_entropy_with_logits(logits=prediction, labels=output_tensor)
loss = tf.reduce_mean(cross_loss)
optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate).minimize(loss)
# Op that initialises the variables when it is run in a session
init = tf.global_variables_initializer()
- 批處理函數
def get_batch(df, i, batch_size, word_index=None, vocab_size=None):
    """Return the i-th mini-batch as word-count vectors and one-hot labels.

    Args:
        df: Object exposing ``data`` (sequence of text strings) and
            ``target`` (sequence of integer labels); both are sliced with
            the same bounds.
        i: Zero-based batch number.
        batch_size: Number of documents per batch. A slice that runs past
            the end of the data simply yields fewer rows.
        word_index: Mapping from lower-cased word to column index. Defaults
            to the module-level ``text_index``.
        vocab_size: Length of each word-count vector. Defaults to the
            module-level ``total_words``.

    Returns:
        A tuple ``(batches, results)`` of NumPy arrays. ``batches`` has one
        row per document holding the count of each word; ``results`` has one
        row per document with a 1 in column 0, 1 or 2. Any label other than
        0 or 1 is encoded in column 2.

    Raises:
        KeyError: If a lower-cased token is missing from ``word_index``.
    """
    if word_index is None:
        word_index = text_index
    if vocab_size is None:
        vocab_size = total_words

    start = i * batch_size
    stop = start + batch_size
    texts = df.data[start:stop]
    labels = df.target[start:stop]

    # One count vector per document, indexed through word_index.
    batches = []
    for text in texts:
        counts = np.zeros(vocab_size, dtype=float)
        for word in text.split(' '):
            counts[word_index[word.lower()]] += 1
        batches.append(counts)

    # One-hot encode the labels over the three classes.
    results = []
    for label in labels:
        one_hot = np.zeros(3, dtype=float)
        one_hot[label if label in (0, 1) else 2] = 1
        results.append(one_hot)

    return np.array(batches), np.array(results)
- 在Session環境中訓練模型
with tf.Session() as sess:
    sess.run(init)

    # The number of batches is the same for every epoch, so compute it once.
    # Integer division means a trailing partial batch is not trained on.
    total_batch = len(train_set.data) // batch_size

    # Training cycle
    for epoch in range(training_epochs):
        avg_cost = 0.
        # Loop over all batches
        for i in range(total_batch):
            batch_x, batch_y = get_batch(train_set, i, batch_size)
            # Run the optimization op (backprop) and the loss op in one call.
            # tf.Session.run(fetches, feed_dict=None, options=None, run_metadata=None)
            # feed_dict carries the data for this step; it is passed in
            # through the two placeholders.
            c, _ = sess.run([loss, optimizer],
                            feed_dict={input_tensor: batch_x, output_tensor: batch_y})
            # Accumulate the average loss of the epoch
            avg_cost += c / total_batch
        # Display logs per epoch step
        if epoch % display_step == 0:
            print("Epoch:", '%04d' % (epoch+1), "loss=",
                  "{:.9f}".format(avg_cost))
    print("Optimization Finished!")

    # Evaluate the model on the test data. This stays inside the session
    # block: accuracy.eval() is called without a session argument, and the
    # variables trained above belong to this session.
    correct_prediction = tf.equal(tf.argmax(prediction, 1), tf.argmax(output_tensor, 1))
    # Calculate accuracy
    accuracy = tf.reduce_mean(tf.cast(correct_prediction, "float"))
    # Size the single evaluation batch from the test split itself.
    total_test_data = len(test_set.target)
    batch_x_test, batch_y_test = get_batch(test_set, 0, total_test_data)
    print("Accuracy:", accuracy.eval({input_tensor: batch_x_test, output_tensor: batch_y_test}))