# Extract the 'sentiment' column of each dataset as a plain array of values.
# NOTE(review): `train` and `test` are created outside this section; the
# string-key lookups and the `data=train` plot below suggest pandas
# DataFrames -- confirm against the loading code.
train_data = train['sentiment'].values
test_data = test['sentiment'].values

# Count plot: one bar per distinct value of the 'label' column of `train`.
plt.figure(figsize=(12, 8))
sns.countplot(data=train, x='label')
plt.xlabel('Numbers')
plt.title('Distribution of Numbers')

# Report the size of both datasets.
# NOTE(review): len() yields the number of entries (rows, if these are
# DataFrames), although the message calls them "features" -- confirm wording.
print(
    "Number of features used for training: \t", len(train),
    "\nNumber of features used for testing: \t", len(test)
)
# Hold-out evaluation with a random forest classifier.
# NOTE(review): the 2-D slicing below assumes `train` and `test` are
# NumPy-style arrays with the label in column 0 and the features in the
# remaining columns -- confirm against the loading code.
clf = RandomForestClassifier(n_estimators=100)  # forest of 100 decision trees

# Fit on a 2-D feature matrix (X) and a flat 1-D label vector (y, via ravel()).
model = clf.fit(
    train[:, 1:],
    train[:, 0].ravel(),
)

# Predict a label for every test row from its feature columns.
output = model.predict(test[:, 1:])

# Accuracy: percentage of predictions that equal the label in column 0.
acc = 100 * np.mean(output == test[:, 0].ravel())
print("The accuracy of the pure RandomForest classifier is: \t", acc, "%")
# Final model: retrain a random forest and write the predictions to a CSV file.
clf = RandomForestClassifier(n_estimators=100)  # 100 trees

# Use all of the training data: labels from column 0, features from the rest.
# NOTE(review): this 2-D indexing needs `train_data` to be a 2-D array with
# the labels in column 0 -- verify that this matches how `train_data` is
# built earlier in the file.
target = train_data[:, 0].ravel()
# NOTE: this rebinds the module-level name `train` to the feature matrix.
train = train_data[:, 1:]
model = clf.fit(train, target)

# Predict the final labels for the test-set data.
output = model.predict(test_data)
# Call-form print: valid on Python 3 and prints the same value on Python 2
# (the statement form `print output` is a SyntaxError on Python 3).
print(output)

# Write the predictions to 'out.csv' with a header row and no index column;
# ImageId is a 1-based running number, Label is the predicted value.
pd.DataFrame(
    {"ImageId": range(1, len(output) + 1), "Label": output}
).to_csv('out.csv', index=False, header=True)