The data for this case study comes from Kaggle.
The data has been anonymized: the features were transformed, likely through dimensionality-reduction compression or other techniques.
1. Reading the Data
First, import the commonly used libraries.
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
# Render minus signs on axis tick labels correctly
plt.rcParams['axes.unicode_minus'] = False
# Use a font that can display Chinese characters
plt.rcParams['font.sans-serif'] = ['SimHei']
# Show all columns when displaying DataFrames
pd.set_option('display.max_columns', None)
%matplotlib inline
Read in the data.
data = pd.read_csv('creditcard.csv')
data.head()
Class is the label column: 0 denotes a normal customer and 1 a fraudulent one.
V1-V28 are 28 features obtained through dimensionality-reduction compression and similar transformations, and they are already normalized. The Amount feature is the raw transaction amount, so we still need to scale it as well.
data.shape
The dataset's shape is (284807, 31).
data.info()
The dataset has no missing values and no string variables.
data['Class'].value_counts()
The label distribution is extremely imbalanced.
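To make the imbalance concrete, a one-line sketch (using only the data DataFrame defined above) prints the class proportions; fraud accounts for only about 0.17% of all transactions:
# Relative frequency of each class
print(data['Class'].value_counts(normalize=True))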
2史翘、數(shù)據(jù)處理
首先枉长,我們要將Amount標(biāo)準(zhǔn)化冀续,并將不需要用到的Time去掉。
from sklearn.preprocessing import StandardScaler
# StandardScaler expects a 2-D array, hence .values.reshape(-1, 1)
data['Amount'] = StandardScaler().fit_transform(data['Amount'].values.reshape(-1, 1))
data = data.loc[:, data.columns != 'Time']
data.head()
3必峰、樣本不均衡解決方案
由于我們的數(shù)據(jù)極不平衡洪唐,在這里有上采樣(將少數(shù)類增多到和多數(shù)類一樣)和下采樣(減少多數(shù)類使其數(shù)量與少數(shù)類相同)兩種方式。先來(lái)看看下采樣的方式吼蚁。
3.1 下采樣策略
## Undersampling
# Separate features and labels
X = data.loc[:, data.columns != 'Class']
y = data['Class']
# Count the minority-class samples
number_records_fraud = len(data[data.Class == 1])
# Indices of the minority-class (fraud) samples
fraud_indices = np.array(data[data.Class == 1].index)
# Indices of the majority-class (normal) samples
normal_indices = data[data.Class == 0].index
# Randomly draw as many majority-class samples as there are minority-class ones
random_normal_indices = np.random.choice(normal_indices, number_records_fraud, replace=False)
random_normal_indices = np.array(random_normal_indices)
# Combine the sampled class-0 indices with all of the class-1 indices
under_sample_indices = np.concatenate([fraud_indices, random_normal_indices])
# Build the undersampled dataset from those indices
under_sample_data = data.iloc[under_sample_indices, :]
# Separate features and labels (.loc replaces the long-deprecated .ix)
X_undersample = under_sample_data.loc[:, under_sample_data.columns != 'Class']
y_undersample = under_sample_data.loc[:, under_sample_data.columns == 'Class']
print("Proportion of normal (class 0): ", len(under_sample_data[under_sample_data.Class == 0])/len(under_sample_data))
print("Proportion of fraud (class 1): ", len(under_sample_data[under_sample_data.Class == 1])/len(under_sample_data))
print("Total samples after undersampling: ", len(under_sample_data))
After undersampling the two classes have equal counts, and the total sample size drops to 984.
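As an aside, the same random undersampling can be done in a couple of lines with imbalanced-learn's RandomUnderSampler; a minimal sketch, assuming imblearn is installed (the _alt names are just for illustration):
from imblearn.under_sampling import RandomUnderSampler

rus = RandomUnderSampler(random_state=0)
X_undersample_alt, y_undersample_alt = rus.fit_resample(X, y)
# Both classes now have 492 samples each
print(y_undersample_alt.value_counts())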
Split the data into training and test sets.
from sklearn.model_selection import train_test_split

# Split the full dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)
print("Training samples (full data): ", len(X_train))
print("Test samples (full data): ", len(X_test))
print("Total samples (full data): ", len(X_train) + len(X_test))

# Split the undersampled dataset
X_train_undersample, X_test_undersample, y_train_undersample, y_test_undersample = train_test_split(X_undersample,
                                                                                                     y_undersample,
                                                                                                     test_size=0.3,
                                                                                                     random_state=0)
print('\n')
print("Training samples (undersampled): ", len(X_train_undersample))
print("Test samples (undersampled): ", len(X_test_undersample))
print("Total samples (undersampled): ", len(X_train_undersample) + len(X_test_undersample))
3.2 Choosing the Regularization Parameter
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import KFold, cross_val_score
from sklearn.metrics import confusion_matrix, recall_score, classification_report

def printing_Kfold_scores(x_train_data, y_train_data):
    fold = KFold(5, shuffle=False)
    # Candidate values for the hyperparameter C
    c_param_range = [0.01, 0.1, 1, 10, 100]
    results_table = pd.DataFrame(index=range(len(c_param_range)),
                                 columns=['C_parameter', 'Mean recall score'])
    results_table['C_parameter'] = c_param_range
    # Each k-fold split yields two index arrays: indices[0] to train on, indices[1] to validate on
    j = 0
    for c_param in c_param_range:
        print('-------------------------------------------')
        print('Hyperparameter C: ', c_param)
        print('-------------------------------------------')
        print('\n')
        recall_accs = []
        for iteration, indices in enumerate(fold.split(x_train_data)):
            # Build an L1-regularized logistic regression model
            # (recent sklearn versions require solver='liblinear' for the l1 penalty)
            lr = LogisticRegression(C=c_param, penalty='l1', solver='liblinear')
            # Train the model
            lr.fit(x_train_data.iloc[indices[0], :], y_train_data.iloc[indices[0], :].values.ravel())
            # Predict on the validation fold
            y_pred_undersample = lr.predict(x_train_data.iloc[indices[1], :].values)
            # Compute the recall on the validation fold
            recall_acc = recall_score(y_train_data.iloc[indices[1], :].values, y_pred_undersample)
            recall_accs.append(recall_acc)
            print('Iteration ', iteration, ': recall = ', recall_acc)
        # The mean of the fold recalls is the score we keep for this C
        results_table.loc[j, 'Mean recall score'] = np.mean(recall_accs)
        j += 1
        print('')
        print('Mean recall score ', np.mean(recall_accs))
        print('')
    best_c = results_table.loc[results_table['Mean recall score'].astype('float64').idxmax()]['C_parameter']
    # Finally, report which of the candidate C values scored best
    print('*********************************************************************************')
    print('The model scores highest in cross-validation when C = {}'.format(best_c))
    print('*********************************************************************************')
    return best_c
We pick the best hyperparameter C (in sklearn, the inverse of the regularization strength) from five orders of magnitude, 0.01, 0.1, 1, 10 and 100, scoring each candidate by recall.
best_c = printing_Kfold_scores(X_train_undersample,y_train_undersample)
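For comparison, the same search could be expressed with sklearn's built-in GridSearchCV, scoring by recall; a minimal sketch under the same setup (not part of the original workflow):
from sklearn.model_selection import GridSearchCV

param_grid = {'C': [0.01, 0.1, 1, 10, 100]}
grid = GridSearchCV(LogisticRegression(penalty='l1', solver='liblinear'),
                    param_grid, scoring='recall', cv=5)
grid.fit(X_train_undersample, y_train_undersample.values.ravel())
# Should broadly agree with the best C found by printing_Kfold_scores
print(grid.best_params_)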
3.3 Model Evaluation
import itertools

def plot_confusion_matrix(cm, classes,
                          title='Confusion matrix',
                          cmap=plt.cm.Blues):
    """
    This function prints and plots the confusion matrix.
    """
    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=0)
    plt.yticks(tick_marks, classes)
    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, cm[i, j],
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")
    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')
lr = LogisticRegression(C=best_c, penalty='l1', solver='liblinear')
lr.fit(X_train_undersample, y_train_undersample.values.ravel())
y_pred_undersample = lr.predict(X_test_undersample.values)
# Compute the confusion matrix
cnf_matrix = confusion_matrix(y_test_undersample, y_pred_undersample)
np.set_printoptions(precision=2)
print("Test set recall: ", cnf_matrix[1,1]/(cnf_matrix[1,0]+cnf_matrix[1,1]))
# Plot the non-normalized confusion matrix
class_names = [0, 1]
plt.figure()
plot_confusion_matrix(cnf_matrix, classes=class_names, title='Confusion matrix')
plt.show()
Test set recall: 0.9387755102040817
That is the model's performance on the undersampled test set; now let's see how it does on the original, non-undersampled test set.
lr = LogisticRegression(C=best_c, penalty='l1', solver='liblinear')
lr.fit(X_train_undersample, y_train_undersample.values.ravel())
y_pred = lr.predict(X_test.values)
# Compute the confusion matrix
cnf_matrix = confusion_matrix(y_test, y_pred)
np.set_printoptions(precision=2)
print("Test set recall: ", cnf_matrix[1,1]/(cnf_matrix[1,0]+cnf_matrix[1,1]))
# Plot the non-normalized confusion matrix
class_names = [0, 1]
plt.figure()
plot_confusion_matrix(cnf_matrix, classes=class_names, title='Confusion matrix')
plt.show()
Test set recall: 0.9183673469387755
Recall drops only slightly, but the confusion matrix shows that 9,505 normal customers are misclassified as fraudulent, which is a serious problem in practice.
Next, let's see how well a model trained on the non-undersampled training set predicts.
lr = LogisticRegression(C=best_c, penalty='l1', solver='liblinear')
lr.fit(X_train, y_train.values.ravel())
y_pred_undersample = lr.predict(X_test.values)
# Compute the confusion matrix
cnf_matrix = confusion_matrix(y_test, y_pred_undersample)
np.set_printoptions(precision=2)
print("Test set recall: ", cnf_matrix[1,1]/(cnf_matrix[1,0]+cnf_matrix[1,1]))
class_names = [0, 1]
plt.figure()
plot_confusion_matrix(cnf_matrix, classes=class_names, title='Confusion matrix')
plt.show()
Test set recall: 0.5510204081632653
The recall is only about 0.55, which is very low.
3.4 How the Threshold Affects Results
By default, sklearn's logistic regression predicts class 1 whenever the estimated probability exceeds 0.5, but we can set the threshold ourselves, and different thresholds change the final evaluation results.
lr = LogisticRegression(C=0.01, penalty='l1', solver='liblinear')
lr.fit(X_train_undersample, y_train_undersample.values.ravel())
y_pred_undersample_proba = lr.predict_proba(X_test_undersample.values)
thresholds = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]
plt.figure(figsize=(10, 10))
j = 1
for i in thresholds:
    # Predict class 1 whenever the class-1 probability exceeds the current threshold
    y_test_predictions_high_recall = y_pred_undersample_proba[:, 1] > i
    plt.subplot(3, 3, j)
    j += 1
    # Compute the confusion matrix
    cnf_matrix = confusion_matrix(y_test_undersample, y_test_predictions_high_recall)
    np.set_printoptions(precision=2)
    print("Test set recall at threshold {}: {}".format(i, cnf_matrix[1,1]/(cnf_matrix[1,0]+cnf_matrix[1,1])))
    class_names = [0, 1]
    plot_confusion_matrix(cnf_matrix, classes=class_names, title='Threshold >= %s' % i)
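Rather than scanning a handful of hand-picked thresholds, we could also trace the full precision-recall tradeoff with sklearn's precision_recall_curve; a minimal sketch reusing the fitted lr and the probabilities computed above:
from sklearn.metrics import precision_recall_curve

# One (precision, recall) point per candidate threshold
precision, recall, pr_thresholds = precision_recall_curve(
    y_test_undersample.values.ravel(), y_pred_undersample_proba[:, 1])
plt.plot(recall, precision)
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.title('Precision-recall curve (undersampled test set)')
plt.show()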
3.5 Oversampling
We saw above that with undersampling the trained model reaches a recall of roughly 0.91 on the test set, far better than the 0.55 of the model trained without undersampling, but it misclassifies 9,505 normal customers as fraudulent.
Now let's train with oversampling instead and see how it performs.
import pandas as pd
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split

credit_cards = pd.read_csv('creditcard.csv')
columns = credit_cards.columns
# Every column except the last one (Class) is a feature
features_columns = columns.delete(len(columns) - 1)
features = credit_cards[features_columns]
labels = credit_cards['Class']
features_train, features_test, labels_train, labels_test = train_test_split(features,
                                                                             labels,
                                                                             test_size=0.2,
                                                                             random_state=0)
# Oversample the training set only, leaving the test set untouched
# (fit_resample replaces the older fit_sample API)
oversampler = SMOTE(random_state=0)
os_features, os_labels = oversampler.fit_resample(features_train, labels_train)
os_features = pd.DataFrame(os_features)
os_labels = pd.DataFrame(os_labels)
best_c = printing_Kfold_scores(os_features, os_labels)
Let's check how the model trained on the SMOTE-oversampled data performs.
lr = LogisticRegression(C=best_c, penalty='l1', solver='liblinear')
lr.fit(os_features, os_labels.values.ravel())
y_pred = lr.predict(features_test.values)
# Compute the confusion matrix
cnf_matrix = confusion_matrix(labels_test, y_pred)
np.set_printoptions(precision=2)
print("Test set recall: ", cnf_matrix[1,1]/(cnf_matrix[1,0]+cnf_matrix[1,1]))
# Plot the non-normalized confusion matrix
class_names = [0, 1]
plt.figure()
plot_confusion_matrix(cnf_matrix, classes=class_names, title='Confusion matrix')
plt.show()
Test set recall: 0.9108910891089109
The recall is about 0.91, almost the same as with undersampling, but only 536 normal customers are misclassified as fraudulent, versus 9,505 with undersampling, so oversampling looks like the better choice here.
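To quantify that tradeoff beyond recall alone, sklearn's classification_report gives precision, recall, and F1 for both classes in one call; a sketch applied to the oversampled model's predictions from above:
from sklearn.metrics import classification_report

# Precision for the fraud class reflects how many flagged customers are truly fraudulent
print(classification_report(labels_test, y_pred, target_names=['normal', 'fraud']))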
PS: The code in this post is adapted from the Tang Yudi machine learning course (《唐宇迪機(jī)器學(xué)習(xí)》).