基本概念

語料（Corpus）：一組原始文本的集合，用于無監督地訓練文本主題的隱層結構。語料中不需要人工標注的附加信息。在Gensim中，Corpus通常是一個可迭代的對象（比如列表）。每一次迭代返回一個可用于表達文本對象的稀疏向量。

向量（Vector）：由一組文本特征構成的列表。是一段文本在Gensim中的內部表達。

稀疏向量（SparseVector）：通常，我們可以略去向量中多余的0元素。此時，向量中的每一個元素是一個(key, value)的元組。

模型（Model）：是一個抽象的術語。定義了兩個向量空間的變換（即從文本的一種向量表達變換為另一種向量表達）。

(1)詞袋模型

'''

import jieba

# Bag-of-words model
# ------------
# Builds a binary bag-of-words vector for each sentence of a tiny corpus:
# segment with jieba, drop punctuation, collect the vocabulary, then mark
# which vocabulary words occur in each sentence.

# Punctuation marks that are dropped after segmentation (the stop list).
punctuation = ["，", "。", "：", "；", "？"]

# The corpus: one sentence per document.
content = [
    "機器學習帶動人工智能飛速的發展。",
    "深度學習帶動人工智能飛速的發展。",
    "機器學習和深度學習帶動人工智能飛速的發展。",
]

# Segment every sentence into a list of tokens.
segs_1 = [jieba.lcut(con) for con in content]
print(segs_1)

# Remove the stop tokens, keeping one token list per sentence.
tokenized = [
    [word for word in sentence if word not in punctuation]
    for sentence in segs_1
]
print(tokenized)

# Vocabulary: every distinct token across all sentences. dict.fromkeys
# de-duplicates while keeping first-seen order, so the column order of the
# vectors below is the same on every run (a plain set() is not ordered).
bag_of_words = list(
    dict.fromkeys(word for sentence in tokenized for word in sentence)
)
print(bag_of_words)

# One binary vector per sentence: 1 where the vocabulary word occurs in the
# sentence, 0 otherwise.
bag_of_word2vec = [
    [1 if token in sentence else 0 for token in bag_of_words]
    for sentence in tokenized
]

# Bag-of-words vectors
print(bag_of_word2vec)

'''

gensim 代碼

from gensim import corpora

# Build the token -> integer id mapping from the tokenized sentences.
dictionary = corpora.Dictionary(tokenized)
print(dictionary)
print(dictionary.token2id)

# Convert every tokenized sentence into its sparse bag-of-words form.
corpus = [dictionary.doc2bow(sentence) for sentence in tokenized]
print(corpus)

(2)tf-idf 表示

import pickle

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

import json

def test(vectorizer, word_bag, test_data):
    """Print the vector that *vectorizer* produces for each test document.

    Args:
        vectorizer: A fitted object exposing ``transform(test_data)`` whose
            result has ``.shape`` and ``.toarray()``.
        word_bag: Feature names, in the same order as the matrix columns.
        test_data: Sequence of documents to transform.

    Prints the matrix shape first, then one line per document listing every
    ``(word, weight)`` pair with the weight shown to five decimals.
    """
    test_matrix = vectorizer.transform(test_data)
    print(test_matrix.shape)

    array = test_matrix.toarray()
    for document, row in zip(test_data, array):
        pairs = ["(%s, %.5f)" % (word, weight) for word, weight in zip(word_bag, row)]
        print(document, "\t", "".join(pairs))

def vectorize():
    """Fit a TF-IDF vectorizer on a toy corpus and round-trip it via pickle.

    Fits ``TfidfVectorizer(norm="l1")`` on four sentences, prints the feature
    names and the vocabulary mapping, prints the vectors of the training data
    and of one unseen sentence, then saves the vectorizer to
    ``data/vectorizer.pkl``, loads it back and prints the unseen sentence's
    vector again using the reloaded object.
    """
    # NOTE(review): the default token pattern of the scikit-learn vectorizers
    # splits on word boundaries, so Chinese text is normally passed in
    # pre-segmented (space separated). These samples look like they lost
    # their spaces during extraction -- confirm against the original data.
    train_data = ["我來到北京清華大學", "他來到了網易杭研大廈", "小明碩士畢業 與中國科學院", "我愛北京天安門"]

    # CountVectorizer() can be swapped in here to get raw counts instead.
    vectorizer = TfidfVectorizer(norm="l1")
    vectorizer.fit(train_data)

    # get_feature_names() was removed in scikit-learn 1.2; prefer the
    # replacement when it exists and fall back on older releases.
    if hasattr(vectorizer, "get_feature_names_out"):
        word_bag = list(vectorizer.get_feature_names_out())
    else:
        word_bag = vectorizer.get_feature_names()

    print("[word_bag] %s" % " ".join(word_bag))
    # The indices may be NumPy integers, which json cannot serialise, so
    # convert them to plain ints first.
    vocabulary = {term: int(index) for term, index in vectorizer.vocabulary_.items()}
    print("[vocabulary] %s" % json.dumps(vocabulary, ensure_ascii=False))

    test(vectorizer, word_bag, test_data=train_data)
    test(vectorizer, word_bag, test_data=["小明來到北京天安門"])

    file_name = "data/vectorizer.pkl"
    # Pickle streams are bytes: the files must be opened in binary mode, and
    # the context managers make sure they are closed (and flushed) before the
    # vectorizer is read back.
    with open(file_name, "wb") as pickle_file:
        pickle.dump(vectorizer, pickle_file, protocol=1)
    with open(file_name, "rb") as pickle_file:
        tfidf_vectorizer = pickle.load(pickle_file)

    test(tfidf_vectorizer, word_bag, test_data=["小明來到北京天安門"])


vectorize()

(3)N-Grams模型

# Count two-word sequences only: ngram_range=(2, 2) sets both the minimum and
# the maximum n-gram length to 2.
bv = CountVectorizer(ngram_range=(2, 2))

bv_matrix = bv.fit_transform(norm_corpus)
bv_matrix = bv_matrix.toarray()

# get_feature_names() was removed in scikit-learn 1.2; prefer the replacement
# when it exists and fall back on older releases.
if hasattr(bv, "get_feature_names_out"):
    vocab = bv.get_feature_names_out()
else:
    vocab = bv.get_feature_names()

# One row per document, one column per bigram.
pd.DataFrame(bv_matrix, columns=vocab)

(4)共現矩陣

import os

import numpy as np
import pandas as pd


# Extract the keywords
def Get_file_keywords(dir):
    """Collect the dictionary keywords found in every file under *dir*.

    Each file is read as UTF-8 and up to 30 keywords are extracted with
    ``jieba.analyse.extract_tags`` (POS filter ``'n'``). Only keywords that
    occur in the text of ``dic.txt`` are kept.

    Args:
        dir: Root directory that is walked recursively.

    Returns:
        ``(formated_data, set_word)``: ``formated_data`` holds one keyword
        list per file, ``set_word`` is the sorted list of distinct keywords
        over all files. If a file cannot be read, the error is printed and
        ``([], [])`` is returned so that callers can still unpack the result.
    """
    # Imported here because only keyword extraction needs jieba; the matrix
    # helpers below work without it.
    import jieba.analyse

    formated_data = []  # one keyword list per document
    set_key_list = []   # every kept keyword, duplicates included
    try:
        with open('dic.txt', 'r', encoding='UTF-8') as dictionary_file:
            # NOTE(review): this is the raw file text, so the membership test
            # below is a substring match -- confirm that this is intended.
            keywords = dictionary_file.read()

        for home, dirs, files in os.walk(dir):
            for filename in files:
                fullname = os.path.join(home, filename)
                with open(fullname, 'r', encoding='UTF-8') as document:
                    sentence = document.read()
                # TF-IDF keyword extraction
                candidates = jieba.analyse.extract_tags(
                    sentence=sentence, topK=30, withWeight=False, allowPOS=('n',))
                # Build a new filtered list instead of removing items from
                # the list being iterated, which silently skips elements.
                words = [word for word in candidates if word in keywords]
                formated_data.append(words)
                set_key_list.extend(words)
    except (OSError, UnicodeDecodeError) as reason:
        print('出現錯誤：', reason)
        return [], []

    # Distinct keywords; sorted so the matrix layout is reproducible.
    set_word = sorted(set(set_key_list))
    return formated_data, set_word


# Initialise the matrix
def build_matirx(set_word):
    """Return an empty co-occurrence table labelled with *set_word*.

    The table is a square list of lists with ``len(set_word) + 1`` rows and
    columns. Row 0 and column 0 hold the keywords, every other cell (and the
    top-left corner) is the empty string.
    """
    labels = list(set_word)
    header = [''] + labels
    return [header] + [[label] + [''] * len(labels) for label in labels]


# Count how often each pair of keywords occurs together
def count_matrix(matrix, formated_data):
    """Fill *matrix* in place with pairwise co-occurrence counts.

    Args:
        matrix: Table produced by ``build_matirx``.
        formated_data: One keyword collection per document.

    Returns:
        The same *matrix* object. Each inner cell holds, as a string, the
        number of documents that contain both the row keyword and the column
        keyword; cells whose two keywords are equal (the diagonal) are '0'.
    """
    size = len(matrix)
    for row in range(1, size):
        row_word = matrix[row][0]
        for col in range(1, size):
            col_word = matrix[0][col]
            if row_word == col_word:
                # A keyword is not counted as co-occurring with itself.
                matrix[row][col] = str(0)
            else:
                counter = sum(
                    1 for ech in formated_data
                    if col_word in ech and row_word in ech
                )
                matrix[row][col] = str(counter)
    return matrix


def main():
    """Build the keyword co-occurrence matrix and write it to data.csv."""
    formated_data, set_word = Get_file_keywords('D:\\untitled\\test')
    print(set_word)
    print(5244)  # NOTE(review): looks like a leftover debug marker
    print(formated_data)
    matrix = build_matirx(set_word)
    matrix = count_matrix(matrix, formated_data)
    data1 = pd.DataFrame(matrix)
    # utf_8_sig writes a BOM in front of the UTF-8 data.
    data1.to_csv('data.csv', index=False, encoding='utf_8_sig')


if __name__ == "__main__":
    main()

(2)word2vec

'''
from gensim.models import Word2Vec

import jieba

# Stop list: punctuation tokens that are dropped after segmentation.
punctuation = [",", "。", ":", ";", ".", "'", '"', "’", "?", "/", "-", "+", "&", "(", ")"]

# Training corpus: a few sentences about the Yangtze and the Yellow River.
sentences = [
    "長江是中國第一大河，干流全長6397公里（以沱沱河為源），一般稱6300公里。流域總面積一百八十余萬平方公里，年平均入海水量約九千六百余億立方米。以干流長度和入海水量論，長江均居世界第三位。",
    "黃河，中國古代也稱河，發源于中華人民共和國青海省巴顏喀拉山脈，流經青海、四川、甘肅、寧夏、內蒙古、陜西、山西、河南、山東9個省區，最后于山東省東營墾利縣注入渤海。干流河道全長5464千米，僅次于長江，為中國第二長河。黃河還是世界第五長河。",
    "黃河,是中華民族的母親河。作為中華文明的發祥地,維系炎黃子孫的血脈.是中華民族民族精神與民族情感的象征。",
    "黃河被稱為中華文明的母親河。公元前2000多年華夏族在黃河領域的中原地區形成、繁衍。",
    "在蘭州的“黃河第一橋”內蒙古托克托縣河口鎮以上的黃河河段為黃河上游。",
    "黃河上游根據河道特性的不同，又可分為河源段、峽谷段和沖積平原三部分。 ",
    "黃河,是中華民族的母親河。",
]

# Segment every sentence into a list of tokens.
sentences = [jieba.lcut(sen) for sen in sentences]

# Remove the stop tokens, keeping one token list per sentence.
# NOTE(review): the stop list holds the ASCII comma and brackets only, so the
# full-width "，", "（", "）" and "、" used in the sentences stay in the
# training data -- confirm whether that is intended.
tokenized = [
    [word for word in sentence if word not in punctuation]
    for sentence in sentences
]

# sg=1 selects skip-gram; hs=1 and negative=1 enable hierarchical softmax and
# negative sampling together; min_count=2 drops tokens seen fewer than twice.
# NOTE(review): `size` is the gensim < 4.0 keyword; gensim 4.0 renamed it to
# `vector_size` -- confirm the installed version before changing it.
model = Word2Vec(
    tokenized,
    sg=1,
    size=100,
    window=5,
    min_count=2,
    negative=1,
    sample=0.001,
    hs=1,
    workers=4,
)

# Save the trained model to disk and load it back.
model.save('model')
model = Word2Vec.load('model')

'''

6文本表示代碼

6文本表示代碼

基本概念

(2)word2vec