項目描述:
kaggle上的絕地求生游戲數據,數據一共4446966條,共計47965場比賽,玩家ID沒有明確標示,參與人數未知
分析可視化思路:
數據字典:
加載數據,查看數據情況
# Load the Kaggle PUBG training set and take a first look at it.
csv_path = r'.\PUBG_Mobile\data\train_V2.csv'
data = pd.read_csv(csv_path)
# Summary statistics first, then column dtypes and non-null counts.
data.describe()
data.info()
共計29個字段,僅有一條缺失值
剔除可能開掛數據,異常值
# Remove rows that look like cheating or are otherwise anomalous.
# Rows containing a missing value are dropped in place first.
data.dropna(inplace=True)
# Keep rows with at most 20 knock-downs (DBNOs).
# NOTE(review): the original comment spoke of kills, but the filtered column is DBNOs - confirm.
df1 = data.loc[data['DBNOs'] <= 20]
# Keep rows with at most 3 road kills.
df2 = df1.loc[df1['roadKills'] <= 3]
# Drop rows that scored a knock-down without walking at all.
stationary_knock = (df2['walkDistance'] == 0) & (df2['DBNOs'] > 0)
df3 = df2.loc[~stationary_knock]
# Drop rows with more than 3 kills where every kill was a headshot.
all_headshots = (df3['kills'] > 3) & (df3['kills'] == df3['headshotKills'])
data_ed = df3.loc[~all_headshots]
# Remaining row count, number of distinct Ids, number of distinct matches.
row_count = len(data_ed)
print(row_count, data_ed['Id'].nunique(), data_ed['matchId'].nunique())
具體分析思路,從分布→排名→吃雞
1.在一局游戲中,玩家所造成的傷害(damageDealt)
# Distribution of damageDealt: distribution plot on the left, box plot on the right.
fig, (ax1, ax2) = plt.subplots(nrows=1, ncols=2)
fig.set_figwidth(15)
damage = data_ed['damageDealt']
sns.distplot(damage, ax=ax1)
sns.boxplot(damage, ax=ax2)
plt.show()
從上圖可以看出,一局游戲中一般玩家造成的傷害在0-500之間
2.擊倒人數分布情況
# Histogram of knock-downs (DBNOs) over the cleaned rows.
plt.figure(figsize=(24, 8), dpi=300)
plt.hist(data_ed['DBNOs'])
plt.show()
哈哈,大部分人都很善良,未曾擊倒1人
3.擊倒人數與玩家排名的關係
# Knock-downs (DBNOs) versus placement in the match (winPlacePerc).
df4 = data_ed[['DBNOs', 'winPlacePerc']]
sns.set(style="darkgrid")
# relplot is a figure-level function: it creates its own figure, sized through
# height/aspect. No plt.figure() call is made beforehand, because it would only
# leave an empty extra figure behind.
g = sns.relplot(data=df4, x="DBNOs", y="winPlacePerc", height=8, linewidth=2, aspect=1.3, kind="line")
plt.title('DBNOs / winPlacePerc', fontsize=15)
g.fig.autofmt_xdate()
4.擊殺人數與游戲排名
# Single-variable view: kills versus rankPoints.
df4 = data_ed[['kills', 'rankPoints']]
sns.set(style="darkgrid")
# relplot is a figure-level function: it creates its own figure, sized through
# height/aspect. No plt.figure() call is made beforehand, because it would only
# leave an empty extra figure behind.
g = sns.relplot(data=df4, x="kills", y="rankPoints", height=8, linewidth=2, aspect=1.3, kind="line")
g.fig.autofmt_xdate()
ELo分1000為中間點,得分達到1000以上,同時擊殺人數需超過30人
5.每種組隊模式的獲勝概率(單排/雙排/四排)
# Win probability for each match type (solo / duo / squad variants).
# Rows that won their match (winPlacePerc == 1), counted per match type.
df_matchType_no1 = (
    data_ed[data_ed.winPlacePerc == 1]
    .groupby('matchType')
    .size()
    .to_frame('count')
)
# All rows, counted per match type.
df_matchType = data_ed.groupby('matchType').size().to_frame('count')
# Inner join on the matchType index; the suffixes keep the two 'count'
# columns apart (left = all rows, right = winning rows).
df_matchType_win = pd.merge(
    df_matchType,
    df_matchType_no1,
    left_index=True,
    right_index=True,
    suffixes=('_total', '_win'),
)
df_matchType_win['勝率'] = df_matchType_win['count_win'] / df_matchType_win['count_total']
plt.figure(dpi=300, figsize=(24, 8))
plt.bar(df_matchType_win.index, df_matchType_win['勝率'])
plt.xticks(rotation=30)
plt.show()
從結果來看,四排的吃雞概率是最高1.4%
6.步行距離與吃雞的關係
# Walking distance versus placement: mean winPlacePerc per walkDistance bin.
# .copy() so that adding a column does not write into a slice of data_ed.
df_ride = data_ed[['walkDistance', 'winPlacePerc']].copy()
# pd.cut with an integer splits the observed min-max range into 8 equal-width
# bins. Fixed "0k-1k"-style labels are not passed: they would only be correct
# if the data spanned exactly 0-8000, so each bar carries its real interval.
df_ride['walkDistance_cut'] = pd.cut(df_ride['walkDistance'], 8)
# observed=False keeps every bin on the axis, including empty ones.
df_ride.groupby('walkDistance_cut', observed=False).winPlacePerc.mean().plot.bar(rot=30, figsize=(24, 8))
plt.xlabel("walkDistance_cut")
plt.ylabel("winPlacePerc")
7.載具移動的距離與吃雞的關係
# Vehicle distance versus placement: mean winPlacePerc per rideDistance bin.
# Only rows with rideDistance below 10000 are considered; .copy() so that
# adding a column does not write into a slice of data_ed.
df_ride = data_ed.loc[data_ed['rideDistance'] < 10000, ['rideDistance', 'winPlacePerc']].copy()
# pd.cut with an integer splits the observed min-max range into 8 equal-width
# bins. Fixed "0k-1k"-style labels are not passed: they would only be correct
# if the data spanned exactly 0-8000, so each bar carries its real interval.
df_ride['drive'] = pd.cut(df_ride['rideDistance'], 8)
# observed=False keeps every bin on the axis, including empty ones.
df_ride.groupby('drive', observed=False).winPlacePerc.mean().plot.bar(rot=30, figsize=(24, 8))
plt.xlabel("rideDistance")
plt.ylabel("winPlacePerc")
8.增益物品與吃雞的關係
# Boost items used (boosts) versus placement (winPlacePerc).
df4 = data_ed[['boosts', 'winPlacePerc']]
sns.set(style="darkgrid")
# relplot is a figure-level function: it creates its own figure, sized through
# height/aspect. No plt.figure() call is made beforehand, because it would only
# leave an empty extra figure behind.
g = sns.relplot(data=df4, x="boosts", y="winPlacePerc", height=8, linewidth=2, aspect=1.3, kind="line")
g.fig.autofmt_xdate()
多變量相關性
# Drop the columns that are not used for modelling: Id, groupId, matchId, matchType.
# NOTE(review): this starts from `data` (missing-value row removed only), not
# from the filtered `data_ed` - confirm that is intended.
unused_columns = ['Id', 'groupId', 'matchId', 'matchType']
data_m = data.drop(columns=unused_columns)
# Pairwise correlation of the remaining columns, drawn as an annotated heatmap.
matrix = data_m.corr()
cmap = sns.diverging_palette(
    250, 15, s=70, l=75, n=40, center="light", as_cmap=True
)
plt.figure(figsize=(24, 12))
sns.heatmap(
    matrix,
    center=0,
    annot=True,
    fmt='.2f',
    square=True,
    cmap=cmap,
)
以winPlacePerc出發,相關性比較強的是玩家步行距離、使用增益物品的數量,與殺死玩家的數量為負相關
劃分數據集
# Target is winPlacePerc; every other column of data_m is a feature.
y = data_m['winPlacePerc'].values
x = data_m.drop(columns=['winPlacePerc']).values
# Hold out 30% of the rows for testing. The seed is fixed so the split, and
# the metrics computed from it, are reproducible between runs.
xtrain, xtest, ytrain, ytest = train_test_split(x, y, test_size=0.3, random_state=0)
線性回歸
# Linear regression: fit on the training split, predict the held-out split.
# LR is defined outside this section (presumably a linear-regression estimator - confirm).
linear_model = LR()
reg = linear_model.fit(xtrain, ytrain)
y_hat = reg.predict(xtest)
隨機森林
from sklearn.ensemble import RandomForestRegressor

# Random forest.
# The target is fitted as-is with a regressor. Casting it to int64 for a
# classifier truncates every fractional winPlacePerc value, and this model is
# evaluated with regression metrics (MSE / RMSE / MAE).
rfc = RandomForestRegressor(random_state=0)
rfc = rfc.fit(xtrain, ytrain)
rfc_y_hat = rfc.predict(xtest)
分別以RMSE、MSE、R方以及MAE,用以評估回歸模型的精度
# Linear regression: error metrics on the held-out split.
MSE = metrics.mean_squared_error(ytest, y_hat)
# RMSE is the square root of the MSE computed above.
RMSE = MSE ** 0.5
MAE = metrics.mean_absolute_error(ytest, y_hat)
(MSE, RMSE, MAE)
mse=0.016028860503889776, rmse=0.126605136167099378,mae=0.09272709032057316
# Random forest: error metrics on the held-out split.
MSE = metrics.mean_squared_error(ytest, rfc_y_hat)
# RMSE is the square root of the MSE computed above.
RMSE = MSE ** 0.5
MAE = metrics.mean_absolute_error(ytest, rfc_y_hat)
(MSE, RMSE, MAE)
mse=0.014725708056613685,rmse=0.12134952845649498, mae=0.08928706404803585
借鑒
https://codeantenna.com/a/Rn2nLom4jT
http://www.reibang.com/p/57c0f0266c10
https://www.heywhale.com/mw/project/63f19d69030c7011ddd54ab7