導入的包
from __future__ import print_function

import gensim
import metaknowledge as mk
import numpy as np
import pyLDAvis
import pyLDAvis.gensim as gensimvis
import sklearn
from gensim import corpora, models
from nltk.tokenize import RegexpTokenizer
from sklearn.decomposition import NMF, LatentDirichletAllocation
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from stop_words import get_stop_words
mk核心代碼
- 抽取出需要分析的文本並轉換為數組
# Load the bibliographic records stored in the example-data folder.
folder_collec = mk.RecordCollection(r'F:\metaknow\example data')

# The stop-word list must exist before it is handed to forNLP() as dropList;
# referencing it before assignment raises NameError.
stopwords = get_stop_words('en')

# Extract the text fields for NLP. The first argument is the CSV output path;
# the text is lower-cased and numbers are removed as requested by the flags.
topic = folder_collec.forNLP(
    r'F:\metaknow\example data\LDA_folder.csv',
    dropList=stopwords,
    lower=True,
    removeNumbers=True,
)

# Keep only the abstracts: `document` feeds the gensim pipeline and `docs`
# (the same data as an array) feeds the scikit-learn vectorizers.
document = topic['abstract']
docs = np.asarray(document)
-
保存csv文件，結構如下：
用于分析的文本文件
gensim包的LDA分析
文本數(shù)據(jù)清洗
- 分詞
- 去除停用詞
- 詞向量化
*************************************分詞*****************************
# Regex tokenizer: splits every English sentence into runs of word characters.
tokenizer = RegexpTokenizer(r'\w+')

# One token list per abstract, in the same order as `document`.
tokens = [tokenizer.tokenize(abstract) for abstract in document]
*************************************去除停用詞*****************************
# Load the English stop-word list with get_stop_words.
stopwords = get_stop_words('en')

# Membership is tested once per token, so look words up in a set rather than
# scanning the list every time.
_stopword_set = set(stopwords)

# Token lists with the stop words filtered out, one list per abstract.
cleaned_tokens = [
    [word for word in doc_tokens if word not in _stopword_set]
    for doc_tokens in tokens
]
*************************************詞句子向量化*****************************
# Build the dictionary: every distinct word in the corpus gets a unique id.
# Reference: https://blog.csdn.net/xuxiuning/article/details/47720337
dictionary = corpora.Dictionary(cleaned_tokens)

# Bag-of-words model: each abstract is represented by how often each
# dictionary word occurs in it, independent of the original word order.
# The token lists are passed to doc2bow() directly; wrapping the ragged
# list-of-lists in a NumPy array first is unnecessary and fails on recent
# NumPy releases when the abstracts have different lengths.
corpus = [dictionary.doc2bow(doc_tokens) for doc_tokens in cleaned_tokens]

# Illustrative example. Given the vocabulary
#   {"John": 1, "likes": 2, "to": 3, "watch": 4, "movies": 5,
#    "also": 6, "football": 7, "games": 8, "Mary": 9, "too": 10}
# a sentence vector such as
#   [1, 2, 1, 1, 1, 0, 0, 0, 1, 1]
# means "John" occurs once, "likes" twice, "to", "watch" and "movies" once
# each, "also", "football" and "games" not at all, and "Mary" and "too" once.
投喂模型
# Fit an LDA model with 50 topics on the bag-of-words corpus, making 20
# passes over the data; id2word maps the integer ids back to words.
ldamodel = models.ldamodel.LdaModel(
    corpus=corpus,
    id2word=dictionary,
    num_topics=50,
    passes=20,
)

# Show 10 topics, each summarised by its 5 highest-weighted words.
top_topics = ldamodel.print_topics(num_topics=10, num_words=5)
print(top_topics)

# Persist the dictionary and the trained model to disk.
dictionary.save(r'F:\metaknow\example data\paper_abstracts.dict')
ldamodel.save(r'F:\metaknow\example data\paper_abstracts_lda.model')
前10個主題結果
[(38, '0.031*"collaboration" + 0.024*"scientific" + 0.021*"impact" + 0.017*"research" + 0.009*"researchers"'),
(17, '0.021*"scientific" + 0.011*"collaboration" + 0.010*"research" + 0.010*"science" + 0.009*"researchers"'),
(1, '0.034*"scientific" + 0.020*"collaboration" + 0.018*"research" + 0.013*"network" + 0.010*"coauthorship"'),
(4, '0.017*"research" + 0.015*"china" + 0.014*"hivaids" + 0.010*"coauthorship" + 0.010*"collaboration"'),
(33, '0.029*"collaboration" + 0.020*"data" + 0.018*"scientific" + 0.007*"events" + 0.007*"can"'),
(14, '0.023*"collaboration" + 0.016*"scientific" + 0.014*"analysis" + 0.013*"international" + 0.011*"study"'),
(44, '0.018*"scientific" + 0.016*"knowledge" + 0.015*"collaborations" + 0.012*"trust" + 0.008*"commercialization"'),
(5, '0.015*"scientific" + 0.010*"research" + 0.009*"order" + 0.008*"teams" + 0.008*"leadership"'),
(31, '0.024*"collaboration" + 0.024*"research" + 0.022*"scientific" + 0.009*"scientists" + 0.008*"network"'),
(22, '0.028*"research" + 0.020*"collaboration" + 0.019*"scientific" + 0.010*"impact" + 0.009*"collaborative"')]
- 可視化
# Prepare the interactive pyLDAvis view from the trained model, the
# bag-of-words corpus and the dictionary.
vis_data = gensimvis.prepare(
    topic_model=ldamodel,
    corpus=corpus,
    dictionary=dictionary,
)

# Serve the visualisation locally without opening a browser window.
pyLDAvis.show(vis_data, open_browser=False)
LDA圖
http://127.0.0.1:8888/#topic=0&lambda=1&term=
sklearn
- tf-idf矩陣
- 非負矩陣分解
TF-idf
#建立tf-idf矩陣，將文章摘要通過 tf-idf值來進行表示，也就是用一個tf-idf值的矩陣來表示文檔(句子也可)
************************************將原始的文檔轉換為tfidf矩陣****************
# Settings for the scikit-learn topic extraction.
features = 500   # keep at most 500 vocabulary terms
topics = 20      # number of topics to extract
top_words = 10   # words printed per topic

# Both vectorizers share the same vocabulary filters:
#   max_df=0.95  - drop terms that occur in more than 95% of the documents
#   min_df=2     - drop terms that occur in fewer than 2 documents
#   stop_words   - remove the built-in English stop-word list
_vectorizer_options = dict(
    max_df=0.95,
    min_df=2,
    max_features=features,
    stop_words='english',
)

# Turn the raw abstracts into a tf-idf weighted document-term matrix.
tfidf_vectorizer = TfidfVectorizer(**_vectorizer_options)
tfidf = tfidf_vectorizer.fit_transform(docs)

# Vectorizer that turns the abstracts into a raw term-count matrix.
tf_vectorizer = CountVectorizer(**_vectorizer_options)
**********************************非負矩陣分解********************************
# Raw term-count matrix for the same documents.
tf = tf_vectorizer.fit_transform(docs)

# Non-negative matrix factorisation of the tf-idf matrix into `topics`
# components.
# NOTE(review): newer scikit-learn releases replace the `alpha` keyword with
# `alpha_W` / `alpha_H` - confirm against the installed version.
nmf = NMF(
    n_components=topics,
    random_state=1,
    alpha=.1,
    l1_ratio=.5,
).fit(tfidf)
#提供打印功能
def print_top_words(model, feature_names, top_words):
    """Print the highest-weighted words of every topic in *model*.

    Args:
        model: Fitted decomposition object exposing ``components_``; each
            row is treated as one topic's weights over the vocabulary.
        feature_names: Sequence mapping a column index to its word.
        top_words: Number of words to print for each topic.
    """
    for topic_idx, topic in enumerate(model.components_):
        print("Topic #%d:" % topic_idx)
        # argsort() is ascending, so the reversed tail slice yields the
        # indices of the `top_words` largest weights, largest first.
        top_indices = topic.argsort()[:-top_words - 1:-1]
        print(" ".join([feature_names[i] for i in top_indices]))
    # Blank line after the last topic.
    print()
# Map column indices back to words. Newer scikit-learn releases only provide
# get_feature_names_out(); fall back to get_feature_names() on older ones.
if hasattr(tfidf_vectorizer, "get_feature_names_out"):
    tfidf_feature_names = tfidf_vectorizer.get_feature_names_out()
else:
    tfidf_feature_names = tfidf_vectorizer.get_feature_names()

# Print the top words of every NMF topic.
print_top_words(nmf, tfidf_feature_names, top_words)
sklearn出的結(jié)果
Topic #0:
collaboration scientific collaborative paper information analysis science social results studies
Topic #1:
international science collaboration national domestic world index sci increased countries
Topic #2:
team scientific teaching construction members innovation university method paper performance
Topic #3:
authors articles number coauthorship journals published article author coauthors publications
Topic #4:
network centrality nodes analysis structure social coauthorship evolution collaboration degree
Topic #5:
data access sharing distributed use software resources including experiments provides
Topic #6:
scientists collaborators changes computer work colleagues connected group coauthored early
Topic #7:
researchers academic increasingly publications sample coauthors brazilian findings communities activities
Topic #8:
networks coauthorship social patterns properties structure clustering links ties high
Topic #9:
research university projects community project topics academic health scientific study
Topic #10:
collaborations scientific firms increasingly success physics domestic large performance benefits
Topic #11:
model empirical distribution degree proposed nodes graph evolution coauthorship node
Topic #12:
countries south collaboration production africa country institutions african latin world
Topic #13:
knowledge sharing scientific innovation production practices domain academic science communication
Topic #14:
china chinese chinas usa eu analysis science past collaborative european
Topic #15:
teams team members scientific new productivity work characteristics size related
Topic #16:
papers published collaboration coauthored citation patterns collaborative identified citations fields
Topic #17:
universities colleges management improve university innovation quality focus scientific engineering
Topic #18:
scholars academic collaboration coauthorship algorithm patterns collaborators based new age
Topic #19:
impact citation publications average citations output number publication greater scientific