1 Problem Description
Problem: email classification
Task: classify each message into one of two classes (spam or ham)
Dataset: https://www.kaggle.com/uciml/sms-spam-collection-dataset#spam.csv
2 Data Processing
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from textblob import Word
import re
from sklearn.model_selection import train_test_split
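If the NLTK corpora are not already installed, the stopword list and the WordNet data used by Word(...).lemmatize() need a one-time download. A minimal sketch:

import nltk
# one-time downloads; skip if the corpora are already present
nltk.download('stopwords')
nltk.download('wordnet')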
Read the data
# read the data
data = pd.read_csv('spam.csv', encoding = "ISO-8859-1")
data.columns
Index(['v1', 'v2', 'Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'], dtype='object')
# view the first 5 rows
data.head()
Drop useless columns
# the last 3 columns contain no useful data, keep only the first two
data = data[['v1', 'v2']]
data.head()
Rename the columns
# rename the column headers
data = data.rename(columns={"v1":"label","v2":"text"})
data.head()
Remove punctuation and extra spaces
# remove punctuation and collapse runs of two or more spaces into one
data['text'] = data['text'].apply(lambda x: re.sub('[!@#$:).;,?&]', ' ', x.lower()))
data['text'] = data['text'].apply(lambda x: re.sub(' +', ' ', x))
data['text'][0]
'go until jurong point crazy available only in bugis n great world la e buffet cine there got amore wat '
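The character class above only covers a handful of punctuation marks. If a broader cleanup is wanted, one alternative (not in the original code) is to strip every non-alphanumeric character and then collapse whitespace:

# alternative cleanup: keep only letters, digits and whitespace, then collapse spaces
data['text'] = data['text'].apply(lambda x: re.sub(r'[^\w\s]', ' ', x.lower()))
data['text'] = data['text'].apply(lambda x: re.sub(r'\s+', ' ', x).strip())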
Convert words to lowercase
# convert every word to lowercase
data['text'] = data['text'].apply(lambda x:" ".join(x.lower() for x in x.split()))
# or alternatively
#data['text'] = data['text'].apply(lambda x:x.lower())
data['text'][0]
'go until jurong point crazy available only in bugis n great world la e buffet cine there got amore wat'
Remove stop words
# remove stop words such as a, an, the, and other high-frequency prepositions, conjunctions and pronouns
stop = stopwords.words('english')
data['text'] = data['text'].apply(lambda x: " ".join(x for x in x.split() if x not in stop))
data['text'][0]
'go jurong point crazy available bugis n great world la e buffet cine got amore wat'
Stemming and lemmatization
# stem and lemmatize each word, aiming to reduce English words to their base forms
st = PorterStemmer()
data['text'] = data['text'].apply(lambda x: " ".join([st.stem(word) for word in x.split()]))
data['text'] = data['text'].apply(lambda x: " ".join([Word(word).lemmatize() for word in x.split()]))
data['text'][0]
'go jurong point crazi avail bugi n great world la e buffet cine got amor wat'
data.head()
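As the output above shows, the Porter stemmer truncates words to crude roots ('crazy' becomes 'crazi'), while lemmatization maps words to dictionary forms. A small illustration of the difference, assuming the same imports as above:

# stemming produces truncated roots, lemmatization produces dictionary forms
print(PorterStemmer().stem('available'))  # 'avail'
print(Word('apples').lemmatize())         # 'apple'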
3 Feature Extraction
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
Using TensorFlow backend.
Split into training and test sets
# split the data into training and test sets at an 8:2 ratio
train, test = train_test_split(data, test_size=0.2)
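The split above is random on every run. If a reproducible, class-balanced split is preferred, train_test_split also accepts a seed and a stratification column (a sketch, not part of the original code; random_state=42 is an arbitrary choice):

# reproducible split that preserves the ham/spam ratio in both subsets
train, test = train_test_split(data, test_size=0.2, random_state=42, stratify=data['label'])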
Set parameters
# maximum length of each sequence: longer sequences are truncated, shorter ones are zero-padded
max_sequence_length = 300
# keep only the 20000 most frequent words
num_words = 20000
# embedding dimension
embedding_dim = 100
Build the tokenizer
# build a tokenizer that keeps the most frequent words
tokenizer = Tokenizer(num_words=num_words)
tokenizer.fit_on_texts(train.text)
train_sequences = tokenizer.texts_to_sequences(train.text)
test_sequences = tokenizer.texts_to_sequences(test.text)
# dictionary containing words and their index
word_index = tokenizer.word_index
# print(tokenizer.word_index)
# total words in the corpus
print('Found %s unique tokens.' % len(word_index))
# get only the top frequent words on train
train_x = pad_sequences(train_sequences, maxlen=max_sequence_length)
# get only the top frequent words on test
test_x = pad_sequences(test_sequences, maxlen=max_sequence_length)
print(train_x.shape)
print(test_x.shape)
Found 6702 unique tokens.
(4457, 300)
(1115, 300)
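To see what the tokenizer and pad_sequences produce, here is a toy example with two made-up messages (hypothetical inputs, same parameters as above):

# words are mapped to integer indices; words unseen during fit_on_texts are dropped
toy_seq = tokenizer.texts_to_sequences(['free entry win prize', 'see you later'])
# sequences are left-padded with zeros up to max_sequence_length
toy_pad = pad_sequences(toy_seq, maxlen=max_sequence_length)
print(toy_pad.shape)  # (2, 300)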
Label vectorization
# label vectorization
# ham -> [1, 0]; spam -> [0, 1]
import numpy as np
def lable_vectorize(labels):
    label_vec = np.zeros([len(labels), 2])
    for i, label in enumerate(labels):
        if str(label) == 'ham':
            label_vec[i][0] = 1
        else:
            label_vec[i][1] = 1
    return label_vec
train_y = lable_vectorize(train['label'])
test_y = lable_vectorize(test['label'])
# or alternatively
from sklearn.preprocessing import LabelEncoder
from keras.utils import to_categorical
# converts the character array to numeric array. Assigns levels to unique labels.
train_labels = train['label']
test_labels = test['label']
le = LabelEncoder()
le.fit(train_labels)
train_labels = le.transform(train_labels)
test_labels = le.transform(test_labels)
# changing data types
labels_train = to_categorical(np.asarray(train_labels))
labels_test = to_categorical(np.asarray(test_labels))
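LabelEncoder assigns integer codes in alphabetical order (ham -> 0, spam -> 1), so to_categorical produces the same one-hot encoding as lable_vectorize above. A quick sanity check (a sketch):

# both encodings should agree: ham -> [1, 0], spam -> [0, 1]
print(np.array_equal(train_y, labels_train))  # expected True
print(np.array_equal(test_y, labels_test))    # expected True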
4 Model Building and Training
# Import libraries
import sys, os, re, csv, codecs, numpy as np, pandas as pd
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from keras.layers import Dense, Input, LSTM, Embedding, Dropout, Activation, Flatten
from keras.layers import Bidirectional, GlobalMaxPool1D, SimpleRNN, BatchNormalization
from keras.layers import Conv1D, MaxPooling1D
from keras.models import Model, Sequential
from keras import initializers, regularizers, constraints, optimizers, layers
model = Sequential()
model.add(Embedding(num_words,
embedding_dim,
input_length=max_sequence_length))
model.add(Dropout(0.5))
model.add(Conv1D(128, 5, activation='relu'))
model.add(MaxPooling1D(5))
model.add(Dropout(0.5))
model.add(BatchNormalization())
model.add(Conv1D(128, 5, activation='relu'))
model.add(MaxPooling1D(5))
model.add(Dropout(0.5))
model.add(BatchNormalization())
model.add(Flatten())
model.add(Dense(128, activation='relu'))
model.add(Dense(2, activation='softmax'))
model.compile(loss='categorical_crossentropy',
optimizer='rmsprop',
metrics=['acc'])
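Before training, model.summary() is a quick way to verify the layer output shapes and parameter counts:

# print layer output shapes and parameter counts
model.summary()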
model.fit(train_x, train_y,
batch_size=64,
epochs=5,
validation_split=0.2)
Train on 3565 samples, validate on 892 samples
Epoch 1/5
3565/3565 [==============================] - 25s 7ms/step - loss: 0.3923 - acc: 0.8480 - val_loss: 0.1514 - val_acc: 0.9451
Epoch 2/5
3565/3565 [==============================] - 23s 7ms/step - loss: 0.1729 - acc: 0.9372 - val_loss: 0.0789 - val_acc: 0.9753
Epoch 3/5
3565/3565 [==============================] - 25s 7ms/step - loss: 0.0940 - acc: 0.9731 - val_loss: 0.2079 - val_acc: 0.9787
Epoch 4/5
3565/3565 [==============================] - 23s 7ms/step - loss: 0.0590 - acc: 0.9857 - val_loss: 0.3246 - val_acc: 0.9843
Epoch 5/5
3565/3565 [==============================] - 23s 7ms/step - loss: 0.0493 - acc: 0.9882 - val_loss: 0.3150 - val_acc: 0.9877
<keras.callbacks.History at 0x1cac6187940>
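Note that val_loss rises after epoch 2 even though training accuracy keeps climbing, a typical sign of overfitting. One common remedy (a sketch, not part of the original run) is early stopping on the validation loss:

from keras.callbacks import EarlyStopping

# stop once val_loss stops improving and roll back to the best weights
early_stop = EarlyStopping(monitor='val_loss', patience=2, restore_best_weights=True)
model.fit(train_x, train_y,
          batch_size=64,
          epochs=20,
          validation_split=0.2,
          callbacks=[early_stop])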
5 Model Evaluation
# evaluate on the test set; returns [loss, accuracy] (an earlier run gave [0.07058866604882806, 0.9874439467229116])
model.evaluate(test_x, test_y)
1115/1115 [==============================] - 2s 2ms/step
[0.32723046118903054, 0.97847533632287]
# prediction on test data
predicted=model.predict(test_x)
predicted
array([[0.71038646, 0.28961352],
[0.71285075, 0.28714925],
[0.7101978 , 0.28980213],
...,
[0.7092874 , 0.29071262],
[0.70976096, 0.290239 ],
[0.70463425, 0.29536578]], dtype=float32)
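Each row holds the class probabilities (column 0 = ham, column 1 = spam); argmax turns them into hard labels:

# convert probabilities to class indices: 0 = ham, 1 = spam
predicted_classes = predicted.argmax(axis=1)
true_classes = test_y.argmax(axis=1)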
# model evaluation
import sklearn
from sklearn.metrics import precision_recall_fscore_support as score
precision, recall, fscore, support = score(test_y,predicted.round())
print('precision: {}'.format(precision))
print('recall: {}'.format(recall))
print('fscore: {}'.format(fscore))
print('support: {}'.format(support))
print("############################")
print(sklearn.metrics.classification_report(test_y,predicted.round()))
precision: [0.97961264 0.97014925]
recall: [0.99585492 0.86666667]
fscore: [0.98766701 0.91549296]
support: [965 150]
############################
              precision    recall  f1-score   support

           0       0.98      1.00      0.99       965
           1       0.97      0.87      0.92       150

 avg / total       0.98      0.98      0.98      1115
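A confusion matrix shows the same result as raw counts (a sketch; argmax converts the one-hot labels and the probabilities to class indices):

from sklearn.metrics import confusion_matrix

# rows are true classes (ham, spam), columns are predicted classes
print(confusion_matrix(test_y.argmax(axis=1), predicted.argmax(axis=1)))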
Source: https://foochane.cn/article/2019052202.html