1. Introduction
fastText is a word-vector and text-classification tool open-sourced by Facebook in 2016; its typical application scenario is supervised text classification. It provides simple and efficient methods for text classification and representation learning, with accuracy comparable to deep learning models and much faster training.
fastText combines some of the most successful ideas in natural language processing and machine learning: representing sentences with bags of words and bags of n-grams, using subword information, and sharing information across classes through a hidden representation. It additionally uses a hierarchical softmax, which exploits the imbalanced class distribution to speed up computation.
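These ideas map directly onto training options in the pre-0.9 fasttext Python API used throughout this post. A minimal sketch showing where bag-of-n-grams and hierarchical-softmax settings are switched on (file names and parameter values are illustrative, not tuned):

import fasttext

# 'train.txt': one "__label__X , segmented text" line per sample (placeholder file)
clf = fasttext.supervised(
    'train.txt', 'model/demo',
    label_prefix='__label__',
    word_ngrams=2,  # bag of word n-grams: unigrams + bigrams
    loss='hs',      # hierarchical softmax speeds up training over many classes
    dim=100,        # size of the hidden representation shared across classes
    epoch=25,
    lr=0.5)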
2. Training Example
# -*- coding: utf-8 -*-
from sklearn.externals import joblib
import pandas as pd
import numpy as np
import warnings
import jieba
import re
import time
import fasttext
import random
from stop_words import stop_word
from sklearn.model_selection import train_test_split
from matplotlib import pyplot as plt
from sklearn.metrics import confusion_matrix
warnings.filterwarnings('ignore')
data_content = pd.read_excel('語料.xlsx', index_col=None, encoding='utf-8')
contents = data_content['語料'].values
targets = data_content['敏感等級(1高度、2敏感、3不敏感)'].values
jieba.load_userdict("key_word.csv")
source = []
# Data preprocessing
for i in range(len(contents)):
    content = contents[i]
    content_string = re.sub("\|uid|Name|content|dtype|object|[\]\[\:\...\:\.\!\,\,\·\…\~\。\;\;\?\-\─\*\—\”\《\》]|[\/\?\?\、\~\】\【\(\)\)\__\____]", "", content)
    content_cut = ''.join(content_string.split())
    content_seglist = jieba.lcut(content_cut, cut_all=False)
    content_seglist = [word.strip().replace('\ufeff', '') for word in content_seglist if word not in stop_word]  # remove stop words
    content_seglist = ' '.join(content_seglist)
    content_text = "__label__" + str(targets[i]) + " , " + content_seglist
    source.append(content_text)
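# At this point each element of source is one fastText training line, e.g.
# (hypothetical sample): "__label__2 , 詞一 詞二 詞三"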
x_train, x_test, y_train, y_test = train_test_split(source, targets, test_size = 0.1, random_state=33)
# use with-blocks so the files are flushed and closed before fastText reads them
with open('data/train_data.txt', 'w', encoding='utf-8') as train_text:
    for sentence in x_train:
        train_text.write(sentence + "\n")
with open('data/test_data.txt', 'w', encoding='utf-8') as test_text:
    for sentence in x_test:
        test_text.write(sentence + "\n")
classifier = fasttext.supervised('data/train_data.txt', 'model/classifier.model', label_prefix='__label__')
result = classifier.test('data/test_data.txt')  # evaluate on the held-out file
# predict_proba expects a list of texts in this fasttext API, so read the test file
# back and strip the "__label__X , " prefix before predicting
test_sentences = [line.strip().split(' , ', 1)[-1] for line in open('data/test_data.txt', encoding='utf-8')]
labels = classifier.predict_proba(test_sentences, k=3)
print('Output prediction results')
print(result)
print(labels)
print('P@1:', result.precision)
print('R@1:', result.recall)
print('F@1:', result.f1score)
print('Number of examples:', result.nexamples)
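Training also persists the model as model/classifier.model.bin, which the script in the next section loads back. A minimal sketch of that round trip on a single new sentence (the input text and the printed output are hypothetical):

import jieba
import fasttext

model = fasttext.load_model('model/classifier.model.bin', label_prefix='__label__')
text = ' '.join(jieba.lcut('這是一條新的待分類文本'))  # hypothetical input
print(model.predict_proba([text], k=1))  # e.g. [[('2', 0.87)]] -- illustrative output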
3. Multi-process Prediction
# -*- coding: utf-8 -*-
from sklearn.externals import joblib
import pandas as pd
import numpy as np
import warnings
import jieba
import re
import time
import fasttext
import random
import pymysql as mydb
from multiprocessing import Process, Pool, freeze_support
from multiprocessing import cpu_count
from stop_words import stop_word
from sklearn.model_selection import train_test_split
from matplotlib import pyplot as plt
from sklearn.metrics import confusion_matrix
warnings.filterwarnings('ignore')
# Query the data (when using this database, b.yes_no must also be added to the SQL query)
db = mydb.connect(host='XXXXXX', port=XXXX, user='XXXXX', passwd='XXXXX', db='XXXX', charset='utf8')
sql_cmd = """
select a.tieba_name, a.post_url,
       case a.title when '' then 'empty' else a.title end title,
       case a.content when '' then 'empty' else a.content end content,
       a.floor,
       (case when b.reply IS NULL then 'null' when b.reply = '' then 'empty' else b.reply end) reply,
       from_unixtime(a.time, '%Y-%m-%d') time
from s_content_tieba a
left join s_huifu_tieba b on a.content_id = b.post_id
where from_unixtime(a.time, '%Y-%m-%d') between '2018-11-26' and '2018-11-27'
"""
data_set = pd.read_sql(sql_cmd, db)
db.close()
#data_set = data_set1.iloc[0:10000,]
#data_set['content_label'] = ''
#data_set['content_prob'] = ''
#data_set['reply_label'] = ''
#data_set['reply_prob'] = ''
lens = len(data_set)
idx = list(range(lens))
contents = data_set['content'].values
replies = data_set['reply'].values
jieba.load_userdict("key_word.csv")
pre_model = fasttext.load_model('model/classifier.model.bin', label_prefix='__label__')
print('Start prediction')
def consumer(i):
    print(i)
    content_one = contents[i]
    content_string = re.sub("\|uid|Name|content|dtype|object|[\]\[\:\...\:\.\!\,\,\·\…\~\。\;\;\?\-\─\*\—\”\《\》]|[\/\?\?\、\~\】\【\(\)\)\__\____]", "", content_one)
    content_cut = ''.join(content_string.split())
    content_seglist1 = jieba.lcut(content_cut, cut_all=False)
    content_seglist2 = [word.strip().replace('\ufeff', '') for word in content_seglist1 if word not in stop_word]  # remove stop words
    if len(content_seglist2) > 0:
        content_seglist3 = [' '.join(content_seglist2)]
        result_pre = pre_model.predict(content_seglist3)
        content_labels = result_pre[0][0]
    else:
        content_labels = 'empty'
    reply_one = replies[i]
    reply_string = re.sub("\|uid|Name|content|dtype|object|[\]\[\:\...\:\.\!\,\,\·\…\~\。\;\;\?\-\─\*\—\”\《\》]|[\/\?\?\、\~\】\【\(\)\)\__\____]|回復.*?:|回復.*?:|回復\s(\S+)", "", reply_one)
    reply_cut = ''.join(reply_string.split())
    reply_seglist1 = jieba.lcut(reply_cut, cut_all=False)
    reply_seglist2 = [word.strip().replace('\ufeff', '') for word in reply_seglist1 if word not in stop_word]  # remove stop words
    if len(reply_seglist2) > 0:
        reply_seglist3 = [' '.join(reply_seglist2)]
        result_pre2 = pre_model.predict(reply_seglist3)
        reply_labels = result_pre2[0][0]
    else:
        reply_labels = 'empty'
    return content_labels, reply_labels
b_time1 = time.time()
pool = Pool(cpu_count())
async_result = pool.map_async(consumer, idx)
pool.close()
pool.join()
results = async_result.get()
thx = [e[0] for e in results]
thy = [e[1] for e in results]
data_set['content_label'] = thx
data_set['reply_label'] = thy
print(time.time() - b_time1)
data_set['序號'] = list(range(len(data_set)))
data_mg1 = pd.pivot_table(data_set, index=['tieba_name', 'post_url', 'title', 'content', 'content_label', 'floor', 'reply', 'reply_label', 'time'])
#data_mg1 = pd.pivot_table(data_set, index=['tieba_name', 'post_url', 'title', 'content', 'content_label', 'content_prob', 'floor', 'reply', 'reply_label', 'reply_prob', 'time'])
data_mg1['序號'] = list(range(len(data_mg1)))
now_date = time.strftime('%Y%m%d', time.localtime(time.time()))
data_mg1.to_csv('data/匹配結(jié)果' + now_date + '.csv')
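One caveat: the Pool above relies on the fork start method, so worker processes inherit contents, replies and the loaded pre_model from the parent. On Windows (spawn start method) the same pool setup must live behind a main guard, roughly:

if __name__ == '__main__':
    freeze_support()  # only needed for frozen Windows executables
    pool = Pool(cpu_count())
    results = pool.map(consumer, idx)
    pool.close()
    pool.join()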
4. Summary
fastText is very simple and easy to use; if you want to quickly get a feel for deep-learning-like results, it is worth a try. It can perform unsupervised learning of word vectors, producing embeddings in which related words stay close to each other in the vector space.
It can also be used for supervised text classification tasks (news classification, spam filtering, sentiment analysis of Chinese text, polarity analysis of user reviews in e-commerce). For the underlying theory and word-vector applications, see https://blog.csdn.net/john_bh/article/details/79268850
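A minimal sketch of the unsupervised side, again assuming the pre-0.9 fasttext Python API used above ('corpus.txt' is a placeholder for a file of whitespace-segmented text, and the word pair is hypothetical):

import numpy as np
import fasttext

# Train skip-gram word vectors on a segmented corpus (placeholder file).
model = fasttext.skipgram('corpus.txt', 'model/sg_model', dim=100)

# Related words should end up close together; use cosine similarity as the distance.
def cosine(a, b):
    a, b = np.array(a), np.array(b)
    return float(a.dot(b) / (np.linalg.norm(a) * np.linalg.norm(b)))

print(cosine(model['足球'], model['籃球']))  # hypothetical word pair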