titanic 知識點總結

1，缺失值

填補缺失值可以根據與其相關性較高的屬性進行分組填充

# Fill missing ages with the median age of passengers that share the same Sex and Pclass.
# transform() returns a Series aligned to df_all's own index, so the assignment lines up
# row-for-row. groupby(...).apply(...) prepends the group keys to the result index in
# pandas 2.x, which makes the direct column assignment fail to align.
df_all['Age'] = df_all.groupby(['Sex', 'Pclass'])['Age'].transform(lambda x: x.fillna(x.median()))

如果對某個特征補的缺失值過多，可以將其特征中的分布相似的值進行合并，以達到減少偏度，使填補的值不對最后預測產生很大的影響

2，觀察目標變量（分類）

# Summarise the binary target: absolute counts and percentage share of each class.
# value_counts() is computed once and indexed by class label (1 = survived, 0 = not).
survival_counts = df_train['Survived'].value_counts()
survived = survival_counts[1]
not_survived = survival_counts[0]

total_passengers = df_train.shape[0]
survived_per = survived / total_passengers * 100
not_survived_per = not_survived / total_passengers * 100

print('{} of {} passengers survived and it is the {:.2f}% of the training set.'.format(survived, df_train.shape[0], survived_per))
print('{} of {} passengers didnt survive and it is the {:.2f}% of the training set.'.format(not_survived, df_train.shape[0], not_survived_per))

plt.figure(figsize=(10, 8))

# Name the column via x= instead of passing it positionally: newer seaborn releases
# interpret the first positional argument as `data`, not as the x variable.
sns.countplot(x='Survived', data=df_train)

plt.xlabel('Survival', size=15, labelpad=15)
plt.ylabel('Passenger Count', size=15, labelpad=15)
# Replace the raw 0/1 tick labels with readable names that include the class share.
plt.xticks((0, 1), ['Not Survived ({0:.2f}%)'.format(not_survived_per), 'Survived ({0:.2f}%)'.format(survived_per)])
plt.tick_params(axis='x', labelsize=13)
plt.tick_params(axis='y', labelsize=13)

plt.title('Training Set Survival Distribution', size=15, y=1.05)

plt.show()

3，相關性做圖

# Draw the feature correlation matrices of the training and test sets, stacked vertically.
fig, axs = plt.subplots(nrows=2, figsize=(20, 20))

# PassengerId is an identifier, not a feature, so it is dropped before correlating.
# Only numeric/bool columns are kept: DataFrame.corr() raises on string columns in
# pandas 2.x, whereas older versions silently ignored them.
train_corr = df_train.drop(['PassengerId'], axis=1).select_dtypes(include=['number', 'bool']).corr()
test_corr = df_test.drop(['PassengerId'], axis=1).select_dtypes(include=['number', 'bool']).corr()

sns.heatmap(train_corr, ax=axs[0], annot=True, square=True, cmap='coolwarm', annot_kws={'size': 14})
sns.heatmap(test_corr, ax=axs[1], annot=True, square=True, cmap='coolwarm', annot_kws={'size': 14})

for i in range(2):
    axs[i].tick_params(axis='x', labelsize=14)
    axs[i].tick_params(axis='y', labelsize=14)

axs[0].set_title('Training Set Correlations', size=15)
axs[1].set_title('Test Set Correlations', size=15)

plt.show()

4，觀察目標變量與特征（連續）分布圖

# Plot the continuous features on a 2x2 grid:
#   row 0 - distribution of the feature split by survival (training set only)
#   row 1 - distribution of the feature in the training set vs. the test set
#   column i - the i-th entry of cont_features
cont_features = ['Age', 'Fare']
surv = df_train['Survived'] == 1  # boolean mask selecting the survivors

fig, axs = plt.subplots(ncols=2, nrows=2, figsize=(20, 20))
plt.subplots_adjust(right=1.5)

# NOTE(review): sns.distplot is deprecated since seaborn 0.11; consider migrating to
# histplot/kdeplot once the resulting figures have been compared.
for i, feature in enumerate(cont_features):
    # Distribution of survival in feature
    # `ax` selects the subplot to draw on (column i is the left or right panel)
    sns.distplot(df_train[~surv][feature], label='Not Survived', hist=True, color='#e74c3c', ax=axs[0][i])
    sns.distplot(df_train[surv][feature], label='Survived', hist=True, color='#2ecc71', ax=axs[0][i])

    # Distribution of feature in dataset
    sns.distplot(df_train[feature], label='Training Set', hist=False, color='#e74c3c', ax=axs[1][i])
    sns.distplot(df_test[feature], label='Test Set', hist=False, color='#2ecc71', ax=axs[1][i])

    axs[0][i].set_xlabel('')
    axs[1][i].set_xlabel('')

    # Style both rows of this feature's column; the first index is the row.
    for j in range(2):
        axs[j][i].tick_params(axis='x', labelsize=20)
        axs[j][i].tick_params(axis='y', labelsize=20)

    axs[0][i].legend(loc='upper right', prop={'size': 20})
    axs[1][i].legend(loc='upper right', prop={'size': 20})
    axs[0][i].set_title('Distribution of Survival in {}'.format(feature), size=20, y=1.05)
    axs[1][i].set_title('Distribution of {} Feature'.format(feature), size=20, y=1.05)

plt.show()

離散

# Plot survival counts for every categorical feature, one panel per feature.
cat_features = ['Embarked', 'Parch', 'Pclass', 'Sex', 'SibSp', 'Deck']

# Build a single 2-row x 3-column grid and draw on its axes directly. Creating a 3x2
# grid and then addressing panels with plt.subplot(2, 3, i) mixes two conflicting
# layouts on the same figure.
fig, axs = plt.subplots(ncols=3, nrows=2, figsize=(20, 20))
plt.subplots_adjust(right=1.5, top=1.25)

for ax, feature in zip(axs.flat, cat_features):
    sns.countplot(x=feature, hue='Survived', data=df_train, ax=ax)

    ax.set_xlabel('{}'.format(feature), size=20, labelpad=15)
    ax.set_ylabel('Passenger Count', size=20, labelpad=15)
    ax.tick_params(axis='x', labelsize=20)
    ax.tick_params(axis='y', labelsize=20)

    # Replace the raw 0/1 hue labels with readable names.
    ax.legend(['Not Survived', 'Survived'], loc='upper center', prop={'size': 18})
    ax.set_title('Count of Survival in {} Feature'.format(feature), size=20, y=1.05)

plt.show()

5，對連續特征進行分箱

# Discretise the continuous fare into 13 quantile-based bins and store the resulting
# interval categories back in the same column.
fare_bins = pd.qcut(df_all['Fare'], q=13)
df_all['Fare'] = fare_bins

©著作權歸作者所有，轉載或內容合作請聯系作者