```python
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import recall_score
import pandas as pd
import jieba
def cut(text):
    """Segment a title into words and join them with single spaces.

    Segmentation uses ``jieba.cut`` with ``cut_all=True`` (full mode).
    The original author's example for '我愛北京天安門' lists the words
    '我', '愛', '北京', '天安', '天安門', i.e. overlapping candidates
    such as '天安' and '天安門' are both kept.

    :param text: the title to segment
    :return: one string containing the segmented words separated by
        single spaces (a string, not a list)
    """
    # str.join consumes the generator returned by jieba.cut directly, so no
    # intermediate list is needed.
    return ' '.join(jieba.cut(text, cut_all=True))
def message_classification():
    """Train and evaluate a Naive Bayes classifier on two sets of titles.

    Reads the '標題' column of '廣告.csv' and '考研.csv' (paths relative to
    the current working directory), labels the rows '廣告' and '考研'
    respectively, segments every title with ``cut``, splits the data 80/20,
    vectorizes it with TF-IDF, and fits a ``MultinomialNB`` wrapped in a
    3-fold ``GridSearchCV`` with an empty parameter grid. Three metrics are
    printed; nothing is returned.

    :return: None
    """
    # Load the raw titles of both classes.
    ad_titles = pd.read_csv('廣告.csv')['標題'].to_list()
    kaoyan_titles = pd.read_csv('考研.csv')['標題'].to_list()

    # Features and labels are built in the same order (ads first) so that
    # data[i] and target[i] always describe the same title.
    data = [cut(title) for title in ad_titles + kaoyan_titles]
    target = ['廣告'] * len(ad_titles) + ['考研'] * len(kaoyan_titles)

    # Hold out 20% of the samples for testing; the fixed seed keeps the
    # split reproducible.
    x_train, x_test, y_train, y_test = train_test_split(
        data, target, test_size=0.2, random_state=33)

    # Feature extraction: fit TF-IDF on the training split only, then apply
    # the same vocabulary to the test split.
    # NOTE(review): TfidfVectorizer's default token_pattern keeps only tokens
    # of two or more word characters, so single-character words produced by
    # cut() are presumably dropped here - confirm this is intended.
    transfer = TfidfVectorizer()
    x_train = transfer.fit_transform(x_train)
    x_test = transfer.transform(x_test)

    # Multinomial Naive Bayes with 3-fold cross-validation. The parameter
    # grid is empty, so the search only cross-validates the default model.
    estimator = GridSearchCV(MultinomialNB(), param_grid={}, cv=3)
    estimator.fit(x_train, y_train)

    # Evaluate on the held-out test split.
    y_predict = estimator.predict(x_test)

    # Score of the fitted search object on the test split.
    accuracy = estimator.score(x_test, y_test)
    print("準確率: ", accuracy)

    # Mean cross-validated score recorded by the search on the training
    # data (GridSearchCV.best_score_).
    cv_score = estimator.best_score_
    print("綜合值: ", cv_score)

    # Support-weighted recall. Weighting each class recall by its share of
    # the samples makes this value equal to overall accuracy.
    recall = recall_score(y_test, y_predict, average='weighted')
    print("召回率: ", recall)
if __name__ == '__main__':
    # Run the train/evaluate pipeline only when executed as a script, not
    # when this module is imported.
    message_classification()
```
    Building prefix dict from the default dictionary ...
    Loading model from cache C:\Users\LOVEWE~1\AppData\Local\Temp\jieba.cache
    Loading model cost 1.345 seconds.
    Prefix dict has been built succesfully.
    準確率:  0.8650519031141869
    綜合值:  0.8403288619645175
    召回率:  0.8650519031141869