import codecs
import os
import os.path

import jieba  # third-party tokenizer — required by the jieba.cut calls below
import numpy as np
import pandas as pd
# Walk the sample corpus directory and load every file's text, building a
# DataFrame with one row per document: its path and its full content.
# NOTE(review): hard-coded absolute path — adjust to the local corpus location.
filePaths = []
fileContents = []
for root, dirs, files in os.walk("C:/Users/dell/Desktop/datamining/2.1+語(yǔ)料庫(kù)/2.1/SogouC.mini/Sample"):
    for name in files:
        filePath = os.path.join(root, name)
        filePaths.append(filePath)
        # Context manager guarantees the handle is closed even if read() raises.
        with codecs.open(filePath, 'r', 'utf-8') as f:
            fileContents.append(f.read())
corpos = pd.DataFrame({'filePath': filePaths, 'fileContent': fileContents})
corpos
# Tokenize every document with jieba into a long-format DataFrame of
# (segment, filePath) pairs, then count how often each segment occurs.
segments = []
filePaths = []
for index, row in corpos.iterrows():
    filePath = row['filePath']
    fileContent = row['fileContent']
    # jieba.cut yields tokens lazily; record the source file for each token.
    for seg in jieba.cut(fileContent):
        segments.append(seg)
        filePaths.append(filePath)
segmentDataFrame = pd.DataFrame({'segment': segments, 'filePath': filePaths})
segmentDataFrame
# Per-segment frequency, most common first.
# groupby(...)['segment'].agg({'計數': np.size}) was removed in modern pandas;
# groupby(...).size().reset_index(name=...) is the supported equivalent.
segStat = segmentDataFrame.groupby('segment').size().reset_index(name='計數').sort_values('計數', ascending=False)
segmentDataFrame
我們發(fā)現(xiàn)存在jieba切分后有一些停用詞在干擾丛肢,類似空格漆羔、標(biāo)點(diǎn)以及一些中文中的介詞助詞等等。
所以担神,此時(shí)我們需要導(dǎo)入一個(gè)停用詞庫(kù)楼吃,停用詞庫(kù)中的詞就不要放入切詞數(shù)組中。
在這里需要注意的是妄讯,DataFrame沒(méi)有sort這個(gè)屬性孩锡,查了下,DataFrame有sort_values,
具體用法就是.sort_values('列名', ascending=False)
# Load the Chinese stop-word list, then drop stop-word rows from the
# per-segment frequency table built above.
stopwords = pd.read_csv(
    "C:\\Users\\dell\\Desktop\\datamining\\2.3\\StopwordsCN.txt",
    encoding='utf-8',
    index_col=False,
)
is_stopword = segStat.segment.isin(stopwords.stopword)
fSegStat = segStat[~is_stopword]
fSegStat
# Re-tokenize the corpus, this time discarding stop words and tokens whose
# stripped length is <= 1 (single characters, whitespace, punctuation).
segments = []
filePaths = []
# Hoist the stop-word set: membership in a set is O(1), whereas testing
# `seg not in stopwords.stopword.values` scans the whole array per token.
stopwordSet = set(stopwords.stopword.values)
for index, row in corpos.iterrows():
    filePath = row['filePath']
    fileContent = row['fileContent']
    for seg in jieba.cut(fileContent):
        if seg not in stopwordSet and len(seg.strip()) > 1:
            segments.append(seg)
            filePaths.append(filePath)
segmentDataFrame = pd.DataFrame({'segment': segments, 'filePath': filePaths})
詞云圖
地址:https://www.lfd.uci.edu/~gohlke/pythonlibs/#wordcloud
pip install wordcloud-1.4.1-cp36-cp36m-win_amd64.whl
# Recount segment frequencies and render the filtered counts as a word cloud.
# The dict form of agg ({'計數': np.size}) was removed in modern pandas;
# size().reset_index(name=...) produces the same two-column frequency table.
segStat = segmentDataFrame.groupby('segment').size().reset_index(name='計數').sort_values('計數', ascending=False)
fSegStat = segStat[~segStat.segment.isin(stopwords.stopword)]
from wordcloud import WordCloud
import matplotlib.pyplot as plt
# font_path must point to a CJK-capable font, or Chinese glyphs render as boxes.
wordcloud = WordCloud(font_path='C:\\Users\\Data Engineer\\Desktop\\xx\\2.4 詞云繪制\\2.4\\simhei.ttf', background_color='black')
# WordCloud.fit_words expects a {word: frequency} mapping.
words = fSegStat.set_index('segment').to_dict()
wordcloud.fit_words(words['計數'])
plt.imshow(wordcloud)
plt.show()
網(wǎng)上找了一篇有關(guān)燃料電池發(fā)展?fàn)顩r的分析報(bào)告:將文本轉(zhuǎn)化為txt亥贸。
# Load a gbk-encoded report on fuel-cell development, tokenize it, filter
# stop words and short tokens, count frequencies, and plot a word cloud.
with codecs.open('C:\\Users\\Data Engineer\\Desktop\\xx\\2.4 詞云繪制\\2.4\\fuelcell.txt', 'r', 'gbk') as f:
    txt = f.read()
# Set membership is O(1) per token vs scanning stopwords.stopword.values.
stopwordSet = set(stopwords.stopword.values)
contents = [content for content in jieba.cut(txt)
            if content not in stopwordSet and len(content.strip()) > 1]
contentDataFrame = pd.DataFrame({'content': contents})
# Dict-based agg was removed in modern pandas; size() gives the same counts.
contentStat = contentDataFrame.groupby('content').size().reset_index(name='計數').sort_values('計數', ascending=False)
wordcloud = WordCloud(font_path='C:\\Users\\Data Engineer\\Desktop\\xx\\2.4 詞云繪制\\2.4\\simhei.ttf', background_color='black')
# fit_words expects a {word: frequency} mapping.
wordcloud.fit_words(contentStat.set_index('content').to_dict()['計數'])
plt.imshow(wordcloud)
plt.show()