#!/usr/bin/env python
# coding=utf-8
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import metrics
from sklearn.preprocessing import LabelEncoder,OneHotEncoder
from keras.models import Model
from keras.layers import LSTM, Activation, Dense, Dropout, Input, Embedding
from keras.optimizers import RMSprop
from keras.preprocessing.text import Tokenizer
from keras.preprocessing import sequence
from keras.callbacks import EarlyStopping
# ## Font setup
# from matplotlib.font_manager import FontProperties
# # fonts = FontProperties(fname = "/Library/Fonts/华文细黑.ttf",size=14)
# # %config InlineBackend.figure_format = 'retina'
# %matplotlib inline
import matplotlib
# Make matplotlib render Chinese characters in plots
matplotlib.rcParams['font.sans-serif'] = ['SimHei']
# Show all columns
pd.set_option('display.max_columns', None)
# Show all rows
pd.set_option('display.max_rows', None)
# Widen the console display to 10000 characters (the default is 80)
pd.set_option('display.width', 10000)
pd.set_option('display.unicode.ambiguous_as_wide', True)
pd.set_option('display.unicode.east_asian_width', True)
#
np.set_printoptions(linewidth=1000)
## Load the train / validation / test sets (raw strings avoid backslash-escape issues in Windows paths)
train_df = pd.read_csv(r"G:\rnn\lstm\cnews-LSTM\cnews_train.csv")
val_df = pd.read_csv(r"G:\rnn\lstm\cnews-LSTM\cnews_val.csv")
test_df = pd.read_csv(r"G:\rnn\lstm\cnews-LSTM\cnews_test.csv")
# print(train_df.head())
# print(val_df.head())
# print(test_df.head())
# Peek at a single cell: iloc[1:2,[2]] returns a one-row, one-column DataFrame
print(train_df.iloc[1:2,[2]])
print(type(train_df.iloc[1:2,[2]]))
# print(len((train_df.iloc[1:2,[2]]).values), (train_df.iloc[1:2,[2]]).values)
print((train_df.iloc[1:2,[2]]).values.tolist())
# Number of distinct characters in the article's cutword string
print(len(set(((train_df.iloc[1:2,[2]]).values.tolist())[0][0])))
print(train_df.iloc[1:2,[3]]) # the 4th column holds the token count for each article
# -------------------------------------------------------------
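# Split the space-joined cutword strings of the first two articles back into token
# lists, then compare their lengths and (after extend) the size of the merged vocabulary.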
a = list(filter(None, (((train_df.iloc[0:1,[2]]).values.tolist())[0][0]).split(" ")))
b = list(filter(None, (((train_df.iloc[1:2,[2]]).values.tolist())[0][0]).split(" ")))
print(len(a), len(b), a, b)
a.extend(b)
print(a)
print(len(a), len(set(a)))
# The dataset is already preprocessed. Each split has 4 columns: the first is the
# label, the second is the raw news text, the third is the segmented text (stop
# words removed, tokens joined with spaces), and the fourth is the token count.
# Data exploration: which labels appear in the training set?
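# A minimal sanity check of that structure (label/cutword/cutwordnum are the column
# names used below; adjust if your CSV differs):
for col in ["label", "cutword", "cutwordnum"]:
    assert col in train_df.columns, "missing expected column: " + col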
plt.figure()
sns.countplot(train_df.label)
# plt.xlabel('Label',fontproperties = fonts,size = 10)
# plt.xticks(fontproperties = fonts,size = 10)
plt.xlabel('Label',size = 10)
plt.xticks(size = 10)
plt.show()
# Analyze the distribution of token counts in the training set
print(train_df.cutwordnum.describe())
plt.figure()
plt.hist(train_df.cutwordnum,bins=100)
# plt.xlabel("Token count",fontproperties = fonts,size = 12)
# plt.ylabel("Frequency",fontproperties = fonts,size = 12)
# plt.title("Training set",fontproperties = fonts)
plt.xlabel("Token count",size = 12)
plt.ylabel("Frequency",size = 12)
plt.title("Training set")
plt.show()
# Next, encode the labels: first with LabelEncoder(), then with OneHotEncoder(). ----------------------------- Encode labels
# Encode the label column of each split
train_y = train_df.label
val_y = val_df.label
test_y = test_df.label
le = LabelEncoder()
train_y = le.fit_transform(train_y).reshape(-1,1)
val_y = le.transform(val_y).reshape(-1,1)
test_y = le.transform(test_y).reshape(-1,1)
# One-hot encode the integer labels
ohe = OneHotEncoder()
train_y = ohe.fit_transform(train_y).toarray()
val_y = ohe.transform(val_y).toarray()
test_y = ohe.transform(test_y).toarray()
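# Quick check (a sketch): LabelEncoder keeps the original class names in le.classes_,
# and one-hot encoding turns each label into a length-10 indicator vector.
print(le.classes_)
print(train_y.shape)  # (n_samples, 10)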
# Encode the segmented text with Tokenizer --------------------------------------------------------------------- Encode text
# After creating a Tokenizer, its fit_on_texts() method splits each text on spaces
# and assigns every word an integer index by frequency: the more frequent the word,
# the smaller its index.
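# A minimal illustration on toy strings (hypothetical data, not from the dataset):
demo_tok = Tokenizer()
demo_tok.fit_on_texts(["a b a", "a c"])
print(demo_tok.word_index)  # {'a': 1, 'b': 2, 'c': 3} -- 'a' is most frequent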
max_words = 5000
max_len = 600
tok = Tokenizer(num_words=max_words) ## keep only the 5000 most frequent words
tok.fit_on_texts(train_df.cutword)
# The word_index attribute maps each word to its index
# The word_counts attribute maps each word to its frequency
for ii, item in enumerate(tok.word_index.items()):
    if ii < 10:
        print(item)
    else:
        break
print("===================")
for ii, item in enumerate(tok.word_counts.items()):
    if ii < 10:
        print(item)
    else:
        break
# Convert the texts to index sequences with tok.texts_to_sequences(), then bring every
# sequence to the same length with sequence.pad_sequences().
# Once every word has an index, each article can be represented as a vector of indices:
train_seq = tok.texts_to_sequences(train_df.cutword)
val_seq = tok.texts_to_sequences(val_df.cutword)
test_seq = tok.texts_to_sequences(test_df.cutword)
## Pad every sequence to the same length
train_seq_mat = sequence.pad_sequences(train_seq,maxlen=max_len)
val_seq_mat = sequence.pad_sequences(val_seq,maxlen=max_len)
test_seq_mat = sequence.pad_sequences(test_seq,maxlen=max_len)
print(train_seq_mat.shape)
print(val_seq_mat.shape)
print(test_seq_mat.shape)
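# pad_sequences pads and truncates at the front by default; a quick sketch:
print(sequence.pad_sequences([[1, 2, 3]], maxlen=5))  # [[0 0 1 2 3]]
print(sequence.pad_sequences([[1, 2, 3]], maxlen=2))  # [[2 3]]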
# Define the LSTM model
inputs = Input(name='inputs',shape=[max_len])
## Embedding(vocabulary size, embedding dimension, input_length=tokens per article)
layer = Embedding(max_words+1,128,input_length=max_len)(inputs)
# layer = LSTM(128)(layer)
layer = LSTM(8)(layer)  # a small LSTM trains quickly; 128 units (above) is the fuller setting
layer = Dense(128,activation="relu",name="FC1")(layer)
layer = Dropout(0.5)(layer)
layer = Dense(10,activation="softmax",name="FC2")(layer)
model = Model(inputs=inputs,outputs=layer)
model.summary()
model.compile(loss="categorical_crossentropy",optimizer=RMSprop(),metrics=["accuracy"])
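# Shape walk-through per batch: input (batch, 600) -> Embedding (batch, 600, 128)
# -> LSTM (batch, 8) -> FC1 (batch, 128) -> FC2 (batch, 10).
# categorical_crossentropy expects the one-hot labels produced above.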
# Train the model
# model_fit = model.fit(train_seq_mat,train_y,batch_size=128,epochs=10,
#                       validation_data=(val_seq_mat,val_y),
#                       callbacks=[EarlyStopping(monitor='val_loss',min_delta=0.0001)] ## stop when val_loss stops improving
#                       )
model_fit = model.fit(train_seq_mat,train_y,batch_size=128,epochs=1,
                      validation_data=(val_seq_mat,val_y),
                      callbacks=[EarlyStopping(monitor='val_loss',min_delta=0.0001)] ## stop when val_loss stops improving
                      )
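# A quick sketch for inspecting the training curves; the history keys ('accuracy'
# vs. 'acc') vary across Keras versions, so check model_fit.history.keys() first.
plt.figure()
plt.plot(model_fit.history["loss"], label="train loss")
plt.plot(model_fit.history["val_loss"], label="val loss")
plt.legend()
plt.show()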
# -----------------------------------------------------------------------------------------------------------------
# Predict on the test set
test_pre = model.predict(test_seq_mat)
## Evaluate the predictions: compute the confusion matrix (y_true first, then y_pred)
confm = metrics.confusion_matrix(np.argmax(test_y,axis=1),np.argmax(test_pre,axis=1))
## Visualize the confusion matrix
Labname = ["体育","娱乐","家居","房产","教育","时尚","时政","游戏","科技","财经"]
plt.figure(figsize=(8,8))
sns.heatmap(confm, square=True, annot=True,
            fmt='d', cbar=False, linewidths=.8,
            cmap="YlGnBu")
plt.xlabel('Predicted label', size=14)
plt.ylabel('True label', size=14)
plt.xticks(np.arange(10)+0.5, Labname, size=12)
plt.yticks(np.arange(10)+0.5, Labname, size=12)
plt.show()
print(metrics.classification_report(np.argmax(test_y,axis=1),np.argmax(test_pre,axis=1)))
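# Overall accuracy as a single number, a quick check alongside the report:
print(metrics.accuracy_score(np.argmax(test_y,axis=1), np.argmax(test_pre,axis=1)))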