Input format
The input must be an iterable of sentences, where each sentence has already been split into a list of word tokens:
sentences = [['A1', 'A2'], [], [], ...]
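For a larger corpus it is usually easier to stream sentences from disk than to build the whole list in memory. A minimal sketch, assuming a hypothetical file corpus.txt with one whitespace-tokenized sentence per line:
from gensim.models import word2vec
# LineSentence lazily yields each line of the file as a list of tokens,
# matching the iterable-of-token-lists format described above
sentences = word2vec.LineSentence('corpus.txt')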
Model training
# import word2vec
from gensim.models import word2vec
# configure logging so training progress is reported
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
# toy corpus
raw_sentences = ["the quick brown fox jumps over the lazy dogs", "yoyoyo you go home now to sleep"]
# split each sentence into a list of word tokens
sentences = [s.split() for s in raw_sentences]
# build the vocabulary and train the model
model = word2vec.Word2Vec(sentences, min_count=1)
# compute the cosine similarity between two words
model.wv.similarity('dogs', 'you')
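Beyond pairwise similarity, a trained model supports a few other common queries; the word choices below are just illustrative, taken from the toy corpus:
# nearest neighbours of a word in the embedding space
model.wv.most_similar('fox', topn=5)
# the raw embedding vector for a word (a numpy array)
vec = model.wv['fox']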
Saving the model
model.save('/tmp/MyModel')
# model.save() writes a file that cannot be inspected with a text editor, but it keeps
# the full training state, so the model can be reloaded and training can be resumed
model.wv.save_word2vec_format('/tmp/mymodel.txt', binary=False)
model.wv.save_word2vec_format('/tmp/mymodel.bin.gz', binary=True)
# save_word2vec_format() writes the standard word2vec text/binary format, but it drops
# the vocabulary tree and other training state, so training cannot be resumed from it
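Files written by save_word2vec_format() can be read back as plain word vectors with KeyedVectors; a minimal sketch using the text file saved above (query-only, no further training):
from gensim.models import KeyedVectors
# load the word2vec-format text file; the result holds vectors only, no training state
wv = KeyedVectors.load_word2vec_format('/tmp/mymodel.txt', binary=False)
wv.similarity('dogs', 'you')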
Resuming training
# load the model saved with model.save() and continue training on new tokenized sentences
model = word2vec.Word2Vec.load('/tmp/MyModel')
# newer gensim versions require total_examples and epochs to be passed explicitly
model.train(more_sentences, total_examples=model.corpus_count, epochs=model.epochs)
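Note that if more_sentences contains words that were not in the original vocabulary, the vocabulary has to be extended before calling train(); a minimal sketch under that assumption:
# add the previously unseen words to the existing vocabulary in place
model.build_vocab(more_sentences, update=True)
# then continue training as above
model.train(more_sentences, total_examples=model.corpus_count, epochs=model.epochs)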
Loading the model
model = word2vec.Word2Vec.load('/tmp/MyModel')
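A loaded model behaves exactly like the model before it was saved; for example, continuing with the toy vocabulary above:
# similarity queries and word vectors are available again
model.wv.most_similar('dogs', topn=3)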