import io
import os

import numpy
import pandas
from pandas import DataFrame
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
def readFiles(path):
    """Walk *path* recursively and yield ``(file_path, body)`` for every file.

    Each file is treated as an RFC-822 style e-mail: everything up to and
    including the first blank line (the header block) is skipped, and the
    remaining lines form the message body.

    Parameters
    ----------
    path : str
        Root directory to walk.

    Yields
    ------
    tuple[str, str]
        Full path of the file and its body text.
    """
    for root, dirnames, filenames in os.walk(path):
        for filename in filenames:
            # Bug fix: use a distinct name instead of clobbering the
            # `path` parameter with the per-file path.
            file_path = os.path.join(root, filename)
            in_body = False  # flips to True once the header block ends
            lines = []
            # latin1 maps every byte to a character, so malformed mail
            # bytes never raise a decode error.
            with io.open(file_path, 'r', encoding='latin1') as f:
                for line in f:
                    if in_body:
                        lines.append(line)
                    elif line == '\n':
                        # The first empty line separates headers from body.
                        in_body = True
            # NOTE: each line keeps its trailing '\n', so this join inserts
            # an extra blank line between body lines — preserved from the
            # original behaviour so downstream features are unchanged.
            message = '\n'.join(lines)
            yield file_path, message
def dataFrameFromDirectory(path, classification):
    """Collect every message under *path* into a DataFrame.

    Each row holds the message body in column ``'message'`` and the fixed
    label *classification* in column ``'class'``; the row index is the
    file's full path.
    """
    entries = list(readFiles(path))
    index = [file_path for file_path, _ in entries]
    rows = [{'message': body, 'class': classification} for _, body in entries]
    return DataFrame(rows, index=index)
# ---------------------------------------------------------------------------
# Build the labelled corpus.
# Bug fix: DataFrame.append was deprecated in pandas 1.4 and removed in 2.0;
# concatenate the per-directory frames with pandas.concat instead.
# NOTE(review): corpus location is hard-coded — adjust to the local copy.
data = pandas.concat([
    DataFrame({'message': [], 'class': []}),
    dataFrameFromDirectory('C:\\Users\\Sitch\\Downloads\\DataScienceCourse\\emails\\spam', 'spam'),
    dataFrameFromDirectory('C:\\Users\\Sitch\\Downloads\\DataScienceCourse\\emails\\ham', 'ham'),
])
data.head()

# Split into a 60% training set and a 40% held-out test set.
# (This line was originally bare non-comment text, which is a SyntaxError.)
train_data = data.sample(frac=0.6)
test_data = data[~data.index.isin(train_data.index)]

# Bag-of-words features + multinomial Naive Bayes, fitted on the training
# split only so the test evaluation below is honest.
vectorizer = CountVectorizer()
counts = vectorizer.fit_transform(train_data['message'].values)
classifier = MultinomialNB()
targets = train_data['class'].values
classifier.fit(counts, targets)

# Sanity-check predictions on two hand-written examples.
# Bug fix: these three lines originally ran BEFORE `vectorizer` and
# `classifier` were defined, raising NameError; they now run after training.
examples = ['Free Money now!!!!', "Hi,I'm Sitch,how about make a friend?"]
example_counts = vectorizer.transform(examples)
predictions = classifier.predict(example_counts)

# Evaluate accuracy on the held-out test set.
test_counts = vectorizer.transform(test_data['message'].values)
predictions = classifier.predict(test_counts)
# Count correct predictions by pairing true labels with predicted ones
# (replaces the manual index loop and the dead `i = 0` initialiser).
SC = sum(
    1 for actual, predicted in zip(test_data['class'].values, predictions)
    if actual == predicted
)
print("測試集成功率 ", end=' : ')  # "test-set success rate"
print(SC / predictions.size)