數據集地址:https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz
1.下載數據集
2.執行代碼
代碼如下:
# coding=utf-8
import os
import re
import time

import nltk
import numpy as np
import pandas as pd
import pyprind
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline

try:
    # Current scikit-learn releases expose GridSearchCV here.
    from sklearn.model_selection import GridSearchCV
except ImportError:
    # Older scikit-learn releases only ship the legacy module.
    from sklearn.grid_search import GridSearchCV
# time.clock() was removed in Python 3.8.  Keep using it where it still exists
# so this reading stays comparable with any other time.clock() call in the
# script, and fall back to time.perf_counter() on newer interpreters.
start = (getattr(time, 'clock', None) or time.perf_counter)()
# Working directory of the process; aclImdb/ and movie_data.csv are looked up
# relative to it.  Note that os.getcwd() is not the directory of this file.
homedir = os.getcwd()
# Step 1: load the raw aclImdb reviews and write them to movie_data.csv.
# This only has to happen once, so it lives in a function that is NOT called
# at import time.  Run build_movie_csv() manually before the later steps if
# movie_data.csv does not exist yet.
def build_movie_csv(base_dir=None, csv_name='movie_data.csv'):
    """Collect the aclImdb review files into one shuffled CSV file.

    Args:
        base_dir: Directory that contains the extracted ``aclImdb`` folder and
            receives the CSV.  Defaults to the current working directory.
        csv_name: File name of the CSV written inside ``base_dir``.

    Returns:
        The path of the CSV file that was written.

    Raises:
        OSError: If one of the ``aclImdb/<subset>/<label>`` folders is missing
            or a review file cannot be read.
    """
    import io  # io.open accepts ``encoding`` on both Python 2 and Python 3

    if base_dir is None:
        base_dir = os.getcwd()

    labels = {'pos': 1, 'neg': 0}  # positive / negative review labels
    progress = pyprind.ProgBar(50000)  # expected number of review files
    rows = []
    for subset in ('test', 'train'):
        for label in ('pos', 'neg'):
            folder = os.path.join(base_dir, 'aclImdb', subset, label)
            # Sort the listing: os.listdir order is arbitrary, and the seeded
            # shuffle below is only reproducible for a fixed input order.
            for filename in sorted(os.listdir(folder)):
                review_path = os.path.join(folder, filename)
                # Explicit encoding avoids platform-dependent decode errors.
                with io.open(review_path, 'r', encoding='utf-8') as infile:
                    rows.append([infile.read(), labels[label]])
                progress.update()

    # Build the frame once; appending row by row copies the frame every time.
    reviews = pd.DataFrame(rows, columns=['review', 'sentiment'])
    np.random.seed(0)
    # Shuffle so positive and negative samples are interleaved.
    reviews = reviews.reindex(np.random.permutation(reviews.index))
    csv_path = os.path.join(base_dir, csv_name)
    reviews.to_csv(csv_path, index=False, encoding='utf-8')
    return csv_path
# Step 2: text cleaning and feature vectorisation.
# Load the review table from movie_data.csv in the working directory.
df = pd.read_csv(os.path.join(homedir, 'movie_data.csv'))
def preprocessor(text):
    """Strip markup and punctuation from ``text`` while keeping emoticons.

    Args:
        text: Raw review text, possibly containing HTML tags.

    Returns:
        The lower-cased text with every run of non-word characters collapsed
        to a single space, followed by the emoticons found in the input
        (space separated, with the ``-`` nose removed).
    """
    # Drop HTML tags, i.e. everything from '<' up to the matching '>'.
    text = re.sub(r'<[^>]*>', '', text)
    # Collect emoticons before punctuation is stripped: eyes (: ; =), an
    # optional nose (-) and a mouth ( ) ( D P ).
    emoticons = re.findall(r'(?::|;|=)(?:-)?(?:\)|\(|D|P)', text)
    cleaned = re.sub(r'[\W]+', ' ', text.lower())
    # Join with a space so each emoticon stays its own token when the result
    # is later split on whitespace.
    return cleaned + ' '.join(emoticons).replace('-', '')
# Quick manual checks (uncomment to run):
# print(preprocessor(df.loc[0, 'review'][-50:]))  # last 50 characters of the first review
# print(preprocessor("This :) is :( a test :-)!"))
# Clean every review in place.
df['review'] = df['review'].apply(preprocessor)
def tokenizer(text):
    """Split ``text`` into a list of tokens on runs of whitespace."""
    return text.split()
# One shared stemmer instance, reused by tokenizer_porter for every word.
porter = PorterStemmer()


def tokenizer_porter(text):
    """Split ``text`` on whitespace and stem each token with ``porter``."""
    return [porter.stem(word) for word in text.split()]
# Stop-word removal: stop words are words that are very common in text but
# carry little information for telling documents apart.
nltk.download('stopwords')  # make the NLTK stop-word corpus available
stop = stopwords.words('english')  # English stop-word list
# print([w for w in tokenizer_porter('a runner likes running and runs a lot') if w not in stop])
# Step 3: model training.
# Split by position: the first 25000 rows train the model, the rest test it.
# .iloc is used because .loc slices are label based and include the end
# label, which would put row 25000 into both the training and the test set.
X_train = df['review'].iloc[:25000].values
y_train = df['sentiment'].iloc[:25000].values
X_test = df['review'].iloc[25000:].values
y_test = df['sentiment'].iloc[25000:].values
# The reviews are already cleaned and lower-cased by preprocessor(), so the
# vectoriser's own preprocessing is switched off.
tfidf = TfidfVectorizer(strip_accents=None, lowercase=False, preprocessor=None)
param_grid = [
    # Grid 1: TF-IDF features with the vectoriser's default idf and norm settings.
    {
        'vect__ngram_range': [(1, 1)],
        'vect__stop_words': [stop, None],
        'vect__tokenizer': [tokenizer, tokenizer_porter],
        'clf__penalty': ['l1', 'l2'],
        'clf__C': [1.0, 10.0, 100.0],  # log-spaced regularisation strengths
    },
    # Grid 2: same search with idf weighting and normalisation turned off.
    {
        'vect__ngram_range': [(1, 1)],
        'vect__stop_words': [stop, None],
        'vect__tokenizer': [tokenizer, tokenizer_porter],
        'vect__use_idf': [False],
        'vect__norm': [None],
        'clf__penalty': ['l1', 'l2'],
        'clf__C': [1.0, 10.0, 100.0],
    },
]
# liblinear is named explicitly because the grid searches both the l1 and the
# l2 penalty, and not every LogisticRegression solver accepts l1.
lr_tfidf = Pipeline([
    ('vect', tfidf),
    ('clf', LogisticRegression(random_state=0, solver='liblinear')),
])
# 5-fold cross-validated search scored by accuracy; n_jobs=-1 uses all cores.
gs_lr_tfidf = GridSearchCV(lr_tfidf, param_grid, scoring='accuracy', cv=5,
                           verbose=1, n_jobs=-1)
gs_lr_tfidf.fit(X_train, y_train)
# Report the best hyper-parameters, their cross-validation accuracy and the
# accuracy of the refitted best model on the held-out test rows.
print('Best parameter set :%s' % gs_lr_tfidf.best_params_)
print('CV Accuracy:%.3f' % gs_lr_tfidf.best_score_)
clf = gs_lr_tfidf.best_estimator_
print('Test Accuracy:%.3f' % clf.score(X_test, y_test))
# time.clock() was removed in Python 3.8.  Keep using it where it still exists
# so this reading stays comparable with any other time.clock() call in the
# script, and fall back to time.perf_counter() on newer interpreters.
end = (getattr(time, 'clock', None) or time.perf_counter)()
print('finish all in %s' % str(end - start))
執行完成的結果如下:
3.執行代碼時遇到的問題:
(1)No module named pyprind:在服務器的 python3 下執行 .py 文件時提示沒有對應的包,需要安裝。下面的指令表示在 root 權限下,為 python3 安裝 pyprind;之後遇到其他包的相同問題,解決方法類似,替換包名即可
sudo python3 -m pip install pyprind
(2)'encoding' is an invalid keyword argument for this function
解決方法:將 open 改成 io.open:
import io
data_file = io.open("F:\\MyPro\\data.yaml", "r", encoding='utf-8')
(3)'ascii' codec can't encode character u'\x96' in position 1448: ordinal not in range(128)
解決方案(僅適用於 Python 2;Python 3 中已沒有內建的 reload 和 sys.setdefaultencoding),加上如下代碼:
import sys
reload(sys)
sys.setdefaultencoding("utf-8")