Mercedes-Benz Greener Manufacturing
Can you cut the time a Mercedes-Benz spends on the test bench?
01. Overview
In this competition, Daimler is challenging Kagglers to tackle the curse of dimensionality and reduce the time cars spend on the test bench. Competitors will work with a dataset representing different permutations of Mercedes-Benz car features to predict the time it takes a car to pass testing. Winning algorithms will help speed up testing, cutting carbon dioxide emissions without lowering Daimler's standards.
02. Notebook Analysis
This notebook is largely based on https://www.kaggle.com/sudalairajkumar/simple-exploration-notebook-mercedes
The feature-inspection and distribution-plotting techniques in it are quite general, so I am recording them here for future reuse.
Details will be filled in gradually.
Import the libraries
import numpy as np  # linear algebra
import pandas as pd  # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import preprocessing
from sklearn.metrics import r2_score  # used by the custom xgboost eval below
import xgboost as xgb

color = sns.color_palette()
%matplotlib inline

pd.options.mode.chained_assignment = None  # default='warn'
pd.options.display.max_columns = 999

from subprocess import check_output
print(check_output(["ls", "../input"]).decode("utf8"))
1. Inspect the data and the distribution of the target y
# Read the data and print its shape
train_df=pd.read_csv("../input/train.csv")
test_df=pd.read_csv("../input/test.csv")
print("Train shape : ",train_df.shape)
print("Test shape : ",test_df.shape)
print(train_df.head())
# Distribution of the target
plt.figure(figsize=(8,6))
plt.scatter(range(train_df.shape[0]), np.sort(train_df.y.values))
plt.xlabel('index', fontsize=12)
plt.ylabel('y', fontsize=12)
plt.show()  # the sorted target values reveal a handful of extreme outliers

# Clip the outliers, then look at the distribution
ulimit = 180
train_df.loc[train_df['y'] > ulimit, 'y'] = ulimit
plt.figure(figsize=(12,8))
sns.histplot(train_df.y.values, bins=50, kde=False)
plt.xlabel('y value', fontsize=12)
plt.show()
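The cutoff of 180 above is hand-picked; a quantile-based cutoff adapts to the data instead. A minimal sketch (the 0.999 quantile is my own choice, not from the referenced notebook):

# clip the target at its 99.9th percentile rather than a fixed value
ulimit = train_df['y'].quantile(0.999)
train_df.loc[train_df['y'] > ulimit, 'y'] = ulimit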
2. Inspect the features: count, distribution, and data types
# Count the columns of each data type
dtype_df = train_df.dtypes.reset_index()
dtype_df.columns = ["column_name", "column_type"]
dtype_df.groupby("column_type").aggregate('count').reset_index()
# The categorical (object-dtype) features are X0, X1, X2, X3, X4, X5, X6 and X8; the rest are numeric
dtype_df.iloc[:10, :]  # dtypes of the first 10 columns
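The list of categorical columns can also be pulled directly with select_dtypes, which avoids reading it off the dtype table:

# the object-dtype columns are the categorical features
cat_cols = train_df.select_dtypes(include=['object']).columns.tolist()
print(cat_cols)  # X0-X6 and X8 in this dataset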
3. Check for missing values
missing_df = train_df.isnull().sum(axis=0).reset_index()
missing_df.columns = ['column_name', 'missing_count']
missing_df = missing_df.loc[missing_df['missing_count'] > 0]
missing_df = missing_df.sort_values(by='missing_count')
missing_df
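The table comes back empty because this dataset has no missing values; a one-line sanity check over both splits makes that explicit:

# total missing cells in train and test (both should be 0 here)
print(train_df.isnull().values.sum(), test_df.isnull().values.sum())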
4. Inspect the distribution of the binary features, and of the categorical features
# Group columns by their set of unique values (all non-categorical features are 0/1)
unique_values_dict = {}
for col in train_df.columns:
    if col not in ["ID", "y", "X0", "X1", "X2", "X3", "X4", "X5", "X6", "X8"]:
        unique_value = str(np.sort(train_df[col].unique()).tolist())
        tlist = unique_values_dict.get(unique_value, [])
        tlist.append(col)
        unique_values_dict[unique_value] = tlist[:]
for unique_val, columns in unique_values_dict.items():
    print("Columns containing the unique values : ", unique_val)
    print(columns)
    print("--------------------------------------------------")
# Distribution of y for a categorical feature
var_name = "X1"  # only X1 is shown; the other categorical columns work the same way
col_order = np.sort(train_df[var_name].unique()).tolist()
plt.figure(figsize=(12,6))
sns.stripplot(x=var_name, y='y', data=train_df, order=col_order)
plt.xlabel(var_name, fontsize=12)
plt.ylabel('y', fontsize=12)
plt.title("Distribution of y variable with "+var_name, fontsize=15)
plt.show()
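Repeating the view for the remaining categorical columns only needs a loop; a boxplot variant keeps each figure readable (a sketch; the plot type is my substitution):

# one boxplot of y per categorical feature
for cat_col in ["X0", "X1", "X2", "X3", "X4", "X5", "X6", "X8"]:
    order = np.sort(train_df[cat_col].unique()).tolist()
    plt.figure(figsize=(12, 6))
    sns.boxplot(x=cat_col, y='y', data=train_df, order=order)
    plt.title("Distribution of y variable with " + cat_col, fontsize=15)
    plt.show()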
# Stacked counts of 0 vs. 1 for every binary feature
zero_count_list = []
one_count_list = []
cols_list = unique_values_dict['[0, 1]']
for col in cols_list:
    zero_count_list.append((train_df[col] == 0).sum())
    one_count_list.append((train_df[col] == 1).sum())
N = len(cols_list)
ind = np.arange(N)
width = 0.35
plt.figure(figsize=(6, 100))
p1 = plt.barh(ind, zero_count_list, width, color='red')
p2 = plt.barh(ind, one_count_list, width, left=zero_count_list, color="blue")
plt.yticks(ind, cols_list)
plt.legend((p1[0], p2[0]), ('Zero count', 'One Count'))
plt.show()
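With several hundred binary columns the bar chart gets very tall; a sorted table of the fraction of ones per column is a compact complement (my addition):

# fraction of rows where each binary feature is 1, sorted ascending
ones_ratio = train_df[cols_list].mean().sort_values()
print(ones_ratio.head(10))
print(ones_ratio.tail(10))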
# Mean of y when each binary feature is 0 vs. 1, shown as a heatmap
zero_mean_list = []
one_mean_list = []
cols_list = unique_values_dict['[0, 1]']
for col in cols_list:
    zero_mean_list.append(train_df.loc[train_df[col] == 0].y.mean())
    one_mean_list.append(train_df.loc[train_df[col] == 1].y.mean())
new_df = pd.DataFrame({"column_name": cols_list + cols_list,
                       "value": [0] * len(cols_list) + [1] * len(cols_list),
                       "y_mean": zero_mean_list + one_mean_list})
new_df = new_df.pivot(index='column_name', columns='value', values='y_mean')
plt.figure(figsize=(8,80))
sns.heatmap(new_df)
plt.title("Mean of y value across binary variables", fontsize=15)
plt.show()
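Sorting the gap between the two means turns the heatmap into a ranking of which binary flags shift y the most; a sketch reusing the lists built above:

# rank binary columns by the absolute difference in mean y between value 1 and 0
diff_df = pd.DataFrame({"column_name": cols_list,
                        "mean_diff": np.array(one_mean_list) - np.array(zero_mean_list)})
diff_df = diff_df.reindex(diff_df["mean_diff"].abs().sort_values(ascending=False).index)
print(diff_df.head(10))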
5. Inspect the ID column: is the train/test split random or sequential, and does ID help prediction? (Usually it does not help much.)
# ID vs. the target
var_name = "ID"
plt.figure(figsize=(12,6))
sns.regplot(x=var_name, y='y', data=train_df, scatter_kws={'alpha':0.5, 's':30})
plt.xlabel(var_name, fontsize=12)
plt.ylabel('y', fontsize=12)
plt.title("Distribution of y variable with "+var_name, fontsize=15)
plt.show()
# Distribution of ID across the training and test sets
train_df['eval_set'] = "train"
test_df['eval_set'] = "test"
full_df = pd.concat([train_df[["ID","eval_set"]], test_df[["ID","eval_set"]]], axis=0)
plt.figure(figsize=(12,6))
sns.violinplot(x="eval_set", y='ID', data=full_df)
plt.xlabel("eval_set", fontsize=12)
plt.ylabel('ID', fontsize=12)
plt.title("Distribution of ID variable with evaluation set", fontsize=15)
plt.show()
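A rank-correlation coefficient quantifies what the regplot above hints at; the choice of Spearman here is mine:

# rank correlation between ID and the target; values near 0 mean ID carries little signal
print(train_df['ID'].corr(train_df['y'], method='spearman'))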
6. Extract the most important features: xgboost vs. random forest
# xgboost approach (the parameter meanings and tuning steps deserve further study)
for f in ["X0", "X1", "X2", "X3", "X4", "X5", "X6", "X8"]:
    lbl = preprocessing.LabelEncoder()
    lbl.fit(list(train_df[f].values))
    train_df[f] = lbl.transform(list(train_df[f].values))
train_y = train_df['y'].values
train_X = train_df.drop(["ID", "y", "eval_set"], axis=1)
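LabelEncoder imposes an arbitrary ordinal order on each categorical column, which tree models tolerate but linear models would not; a one-hot alternative via pd.get_dummies is sketched below for comparison (the rest of this notebook keeps the label-encoded version):

# one-hot alternative: re-read the raw file since train_df's categoricals are now integers
raw_df = pd.read_csv("../input/train.csv")
onehot_df = pd.get_dummies(raw_df, columns=["X0", "X1", "X2", "X3", "X4", "X5", "X6", "X8"])
print(onehot_df.shape)  # many more columns than the label-encoded frame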
# Thanks to anokas for this #
def xgb_r2_score(preds, dtrain):
    labels = dtrain.get_label()
    return 'r2', r2_score(labels, preds)

xgb_params = {
    'eta': 0.05,
    'max_depth': 6,
    'subsample': 0.7,
    'colsample_bytree': 0.7,
    'objective': 'reg:squarederror',  # 'reg:linear' is the deprecated old name
}
dtrain = xgb.DMatrix(train_X, train_y, feature_names=list(train_X.columns))
model = xgb.train(xgb_params, dtrain, num_boost_round=100, feval=xgb_r2_score, maximize=True)
# plot the important features #
fig, ax = plt.subplots(figsize=(12,18))
xgb.plot_importance(model, max_num_features=50, height=0.8, ax=ax)
plt.show()
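num_boost_round=100 above is a guess; xgb.cv with early stopping can choose it from the data. A minimal sketch under the same parameters (the fold count and patience are my choices):

# 5-fold CV; stops when the CV r2 has not improved for 50 rounds
cv_result = xgb.cv(xgb_params, dtrain, num_boost_round=1000, nfold=5,
                   feval=xgb_r2_score, maximize=True,
                   early_stopping_rounds=50, verbose_eval=50)
print("best number of rounds:", len(cv_result))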
# Random forest approach (the parameter meanings and tuning steps deserve further study)
from sklearn import ensemble
model = ensemble.RandomForestRegressor(n_estimators=200, max_depth=10, min_samples_leaf=4, max_features=0.2, n_jobs=-1, random_state=0)
model.fit(train_X, train_y)
feat_names = train_X.columns.values
## plot the importances ##
importances = model.feature_importances_
std = np.std([tree.feature_importances_ for tree in model.estimators_], axis=0)
indices = np.argsort(importances)[::-1][:20]
plt.figure(figsize=(12, 12))
plt.title("Feature importances")
plt.bar(range(len(indices)), importances[indices], color="r", yerr=std[indices], align="center")
plt.xticks(range(len(indices)), feat_names[indices], rotation='vertical')
plt.xlim([-1, len(indices)])
plt.show()
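As a quick end-to-end check, the fitted forest can score the training data; this in-sample R² is optimistic but confirms the pipeline works (my addition):

# in-sample R^2 of the random forest (optimistic; use CV for an honest estimate)
print("train R^2:", r2_score(train_y, model.predict(train_X)))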