DataFound 2019 User Profiling Competition Analysis
1. Data Description
The dataset ships with Chinese column names, which the code below renames to English equivalents. Original columns:
['用戶編碼', '用戶實名制是否通過核實', '用戶年齡', '是否大學生客戶', '是否黑名單客戶', '是否4G不健康客戶','用戶網(wǎng)齡(月)', '用戶最近一次繳費距今時長(月)', '繳費用戶最近一次繳費金額(元)', '用戶近6個月平均消費值(元)','用戶賬單當月總費用(元)', '用戶當月賬戶余額(元)', '繳費用戶當前是否欠費繳費', '用戶話費敏感度', '當月通話交往圈人數(shù)','是否經(jīng)常逛商場的人', '近三個月月均商場出現(xiàn)次數(shù)', '當月是否逛過福州倉山萬達', '當月是否到過福州山姆會員店', '當月是否看電影','當月是否景點游覽', '當月是否體育場館消費', '當月網(wǎng)購類應用使用次數(shù)', '當月物流快遞類應用使用次數(shù)','當月金融理財類應用使用總次數(shù)', '當月視頻播放類應用使用次數(shù)', '當月飛機類應用使用次數(shù)', '當月火車類應用使用次數(shù)','當月旅游資訊類應用使用次數(shù)', '信用分']
English column names used in the code:
['uid','true_name_flag','age','uni_student_flag','blk_list_flag','4g_unhealth_flag','net_age_till_now','top_up_month_diff','top_up_amount','recent_6month_avg_use','total_account_fee','curr_month_balance','curr_overdue_flag','cost_sensitivity','connect_num','freq_shopping_flag','recent_3month_shopping_count','wanda_flag','sam_flag','movie_flag','tour_flag','sport_flag','online_shopping_count','express_count','finance_app_count','video_app_count','flight_count','train_count','tour_app_count','score']
2. Feature Extraction
- Top-up amount:
  - Integer and decimal amounts likely correspond to different top-up channels:
    for example, integer amounts suggest cash (offline) top-ups, while decimal amounts suggest online payment.
- Stability of phone spending:
  - total_account_fee (current-month bill) divided by recent_6month_avg_use (6-month average spend)
- Current-month fee usage rate: total_account_fee / curr_month_balance
- Row sum of all columns except the user ID (a sketch follows this list)
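The row-sum feature in the last bullet is not implemented in Section 4; below is a minimal sketch, assuming the renamed English columns from Section 1 (add_row_sum_feat is a hypothetical helper name, not from the original code):
# Minimal sketch of the row-sum feature; add_row_sum_feat is a hypothetical
# name, and the exclusion list assumes the English column names from Section 1.
def add_row_sum_feat(df, exclude=('uid', 'score')):
    feat_cols = [c for c in df.columns if c not in exclude]
    df['row_sum'] = df[feat_cols].sum(axis=1)  # per-row sum across all features
    return df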
3. Summation Functions Used
- Sum each row across its columns and append the result as a new column (demonstrated below):
df['col_sum'] = df.apply(lambda x: x.sum(), axis=1)
- Sum each column down its rows and append the result as a new row:
df.loc['row_sum'] = df.apply(lambda x: x.sum())
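A tiny, self-contained check of both patterns on toy data (not competition data). Note that df.sum(axis=1) and df.sum() achieve the same results more directly:
import pandas as pd

toy = pd.DataFrame({'a': [1, 2], 'b': [10, 20]})
toy['col_sum'] = toy.apply(lambda x: x.sum(), axis=1)  # per-row sums: 11, 22
toy.loc['row_sum'] = toy.apply(lambda x: x.sum())      # per-column sums: 3, 30, 33
print(toy)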
4. Code Implementation
# imports
import time
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import pandas as pd
import lightgbm as lgb
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import LabelEncoder
# load the input data
data_path = '../input/'
train_data = pd.read_csv(data_path + 'train_dataset.csv')
test_data = pd.read_csv(data_path + 'test_dataset.csv')
sample_sub = pd.read_csv(data_path + 'submit_example.csv')
# preprocessing: rename the Chinese columns to English
train_data.head(1)
print(train_data.columns)
train_data.columns = ['uid','true_name_flag','age','uni_student_flag','blk_list_flag',\
'4g_unhealth_flag','net_age_till_now','top_up_month_diff','top_up_amount',\
'recent_6month_avg_use','total_account_fee','curr_month_balance',\
'curr_overdue_flag','cost_sensitivity','connect_num','freq_shopping_flag',\
'recent_3month_shopping_count','wanda_flag','sam_flag','movie_flag',\
'tour_flag','sport_flag','online_shopping_count','express_count',\
'finance_app_count','video_app_count','flight_count','train_count',\
'tour_app_count','score']
test_data.columns = train_data.columns[:-1]  # test set has no 'score' column
# feature extraction
# top_up_amount: integer vs. decimal amounts presumably correspond to
# different top-up channels (cash vs. online payment)?
def produce_offline_feat(train_data):
    train_data['top_up_amount_offline'] = 0
    # flag nonzero multiples of 10 as likely offline (cash) top-ups; both
    # comparisons need parentheses because `&` binds tighter than `!=`,
    # and .loc avoids pandas' chained-assignment warning
    mask = (train_data['top_up_amount'] % 10 == 0) & (train_data['top_up_amount'] != 0)
    train_data.loc[mask, 'top_up_amount_offline'] = 1
    return train_data

train_data = produce_offline_feat(train_data)
test_data = produce_offline_feat(test_data)
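An optional sanity check (not in the original code) to see how the flag splits the training users:
# Optional: distribution of the presumed offline-top-up flag.
print(train_data['top_up_amount_offline'].value_counts())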
def produce_fee_rate(train_data):
    # feature importance showed both the current-month fee and the 6-month
    # average are strong, so their ratio measures spending stability
    train_data['current_fee_stability'] = train_data['total_account_fee'] / (train_data['recent_6month_avg_use'] + 1)
    # current-month fee / current account balance (+1 guards against division by zero)
    train_data['use_left_rate'] = train_data['total_account_fee'] / (train_data['curr_month_balance'] + 1)
    return train_data

train_data = produce_fee_rate(train_data)
test_data = produce_fee_rate(test_data)
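Also optional: summary statistics for the two ratios; current_fee_stability values near 1 indicate stable month-to-month spending.
# Optional: eyeball the two ratio features.
print(train_data[['current_fee_stability', 'use_left_rate']].describe())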
# training
def display_importances(feature_importance_df_):
    # top 40 features by importance, averaged over folds
    cols = feature_importance_df_[["feature", "importance"]].groupby("feature").mean().sort_values(by="importance", ascending=False)[:40].index
    best_features = feature_importance_df_.loc[feature_importance_df_.feature.isin(cols)]
    plt.figure(figsize=(8, 10))
    sns.barplot(x="importance", y="feature", data=best_features.sort_values(by="importance", ascending=False))
    plt.title('LightGBM Features (avg over folds)')
    plt.tight_layout()
    plt.savefig('feature_importance.jpg')  # save before show(), which clears the figure
    plt.show()
# parameter set 1: L1 (MAE) objective
params = {
    'learning_rate': 0.01,
    'boosting_type': 'gbdt',
    'objective': 'regression_l1',
    'metric': 'mae',
    'feature_fraction': 0.6,
    'bagging_fraction': 0.8,
    'bagging_freq': 2,
    'num_leaves': 31,
    'verbose': -1,
    'max_depth': 5,
    'lambda_l2': 5,
    'lambda_l1': 0,
    'nthread': 8
}
# parameter set 2: identical except for the L2 objective and a fixed seed;
# averaging models trained on L1 and L2 losses is a simple ensembling trick
params2 = {
    'learning_rate': 0.01,
    'boosting_type': 'gbdt',
    'objective': 'regression_l2',
    'metric': 'mae',
    'feature_fraction': 0.6,
    'bagging_fraction': 0.8,
    'bagging_freq': 2,
    'num_leaves': 31,
    'verbose': -1,
    'max_depth': 5,
    'lambda_l2': 5,
    'lambda_l1': 0,
    'nthread': 8,
    'seed': 89
}
cv_pred_all = 0
en_amount = 3  # ensemble over 3 different fold seeds
for seed in range(en_amount):
    NFOLDS = 5
    train_label = train_data['score']
    # score is an integer credit score, so StratifiedKFold can stratify on it
    kfold = StratifiedKFold(n_splits=NFOLDS, shuffle=True, random_state=seed)
    kf = kfold.split(train_data, train_label)
    train_data_use = train_data.drop(['uid', 'score', 'blk_list_flag'], axis=1)
    test_data_use = test_data.drop(['uid', 'blk_list_flag'], axis=1)
    cv_pred = np.zeros(test_data.shape[0])
    valid_best_l2_all = 0  # accumulates the best validation MAE per fold
    feature_importance_df = pd.DataFrame()
    count = 0
    for i, (train_fold, validate) in enumerate(kf):
        print('fold: ', i, ' training')
        X_train, X_validate = train_data_use.iloc[train_fold, :], train_data_use.iloc[validate, :]
        label_train, label_validate = train_label.iloc[train_fold], train_label.iloc[validate]
        dtrain = lgb.Dataset(X_train, label_train)
        dvalid = lgb.Dataset(X_validate, label_validate, reference=dtrain)
        bst = lgb.train(params, dtrain, num_boost_round=10000, valid_sets=dvalid, verbose_eval=-1, early_stopping_rounds=50)
        cv_pred += bst.predict(test_data_use, num_iteration=bst.best_iteration)
        valid_best_l2_all += bst.best_score['valid_0']['l1']
        fold_importance_df = pd.DataFrame()
        fold_importance_df["feature"] = list(X_train.columns)
        fold_importance_df["importance"] = bst.feature_importance(importance_type='split', iteration=bst.best_iteration)
        fold_importance_df["fold"] = count + 1
        feature_importance_df = pd.concat([feature_importance_df, fold_importance_df], axis=0)
        count += 1
    cv_pred /= NFOLDS
    valid_best_l2_all /= NFOLDS
    cv_pred_all += cv_pred
cv_pred_all /= en_amount
print('cv score for valid is: ', 1 / (1 + valid_best_l2_all))
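The printed value mirrors what appears to be the competition's scoring formula, score = 1/(1 + MAE), so higher is better. A worked example with a hypothetical MAE:
# Worked example of the CV score formula; 14.5 is a hypothetical MAE.
mae = 14.5
print(1 / (1 + mae))  # ~0.0645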
# second ensemble pass: same CV loop, but with params2 (L2 objective) and shifted seeds
cv_pred_all2 = 0
en_amount = 3
for seed in range(en_amount):
    NFOLDS = 5
    train_label = train_data['score']
    kfold = StratifiedKFold(n_splits=NFOLDS, shuffle=True, random_state=(seed + 2019))
    kf = kfold.split(train_data, train_label)
    train_data_use = train_data.drop(['uid', 'score', 'blk_list_flag'], axis=1)
    test_data_use = test_data.drop(['uid', 'blk_list_flag'], axis=1)
    cv_pred = np.zeros(test_data.shape[0])
    valid_best_l2_all = 0
    feature_importance_df = pd.DataFrame()
    count = 0
    for i, (train_fold, validate) in enumerate(kf):
        print('fold: ', i, ' training')
        X_train, X_validate = train_data_use.iloc[train_fold, :], train_data_use.iloc[validate, :]
        label_train, label_validate = train_label.iloc[train_fold], train_label.iloc[validate]
        dtrain = lgb.Dataset(X_train, label_train)
        dvalid = lgb.Dataset(X_validate, label_validate, reference=dtrain)
        bst = lgb.train(params2, dtrain, num_boost_round=10000, valid_sets=dvalid, verbose_eval=-1, early_stopping_rounds=50)
        cv_pred += bst.predict(test_data_use, num_iteration=bst.best_iteration)
        valid_best_l2_all += bst.best_score['valid_0']['l1']
        fold_importance_df = pd.DataFrame()
        fold_importance_df["feature"] = list(X_train.columns)
        fold_importance_df["importance"] = bst.feature_importance(importance_type='split', iteration=bst.best_iteration)
        fold_importance_df["fold"] = count + 1
        feature_importance_df = pd.concat([feature_importance_df, fold_importance_df], axis=0)
        count += 1
    cv_pred /= NFOLDS
    valid_best_l2_all /= NFOLDS
    cv_pred_all2 += cv_pred
cv_pred_all2 /= en_amount
print('cv score for valid is: ', 1 / (1 + valid_best_l2_all))
display_importances(feature_importance_df)
# submission
test_data_sub = test_data[['uid']].copy()  # .copy() avoids SettingWithCopyWarning
test_data_sub['score'] = (cv_pred_all2 + cv_pred_all) / 2  # average the two ensembles
test_data_sub.columns = ['id', 'score']
test_data_sub['score1'] = cv_pred_all
test_data_sub['score2'] = cv_pred_all2
test_data_sub['score'] = test_data_sub['score'].apply(lambda x: int(np.round(x)))  # target scores are integers
test_data_sub[['id', 'score']].to_csv('../output/result_bagging.csv', index=False)
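sample_sub is loaded during preprocessing but never used; an optional final check against it (assuming submit_example.csv has one row per test user, with possibly different column names) can catch format mistakes before uploading:
# Optional format check against the provided sample submission; assumes it
# has one row per test uid (column names in the sample may differ).
assert len(test_data_sub) == len(sample_sub)
print(sample_sub.head(2))
print(test_data_sub[['id', 'score']].head(2))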
5. Feature Analysis Results