import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
data = pd.read_csv("creditcard.csv")
# Count how many samples fall into each class (0 = normal, 1 = fraud)
count_classes = data['Class'].value_counts(sort=True).sort_index()
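# Optional visual check, a sketch not in the original script: plotting count_classes
# as a bar chart makes the heavy class imbalance immediately visible.
count_classes.plot(kind='bar')
plt.title("Class distribution")
plt.xlabel("Class")
plt.ylabel("Frequency")
plt.show()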
from sklearn.preprocessing import StandardScaler
# StandardScaler applies z-score normalisation:
# it rescales the original values to zero mean and unit variance,
# and the result is stored in a new 'normAmount' column.
data['normAmount'] = StandardScaler().fit_transform(data['Amount'].values.reshape(-1, 1))
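# Illustration only (an assumption about StandardScaler's behaviour, not part of the
# original pipeline): the same z-score can be computed by hand. StandardScaler divides
# by the population standard deviation (ddof=0), so the two results should match.
manual_z = (data['Amount'] - data['Amount'].mean()) / data['Amount'].std(ddof=0)
assert np.allclose(manual_z.values, data['normAmount'].values)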
# Drop the Time and Amount columns, which are no longer needed,
# to obtain a new data set.
data = data.drop(['Time', 'Amount'], axis=1)
# Split the data into features and labels
X = data.loc[:, data.columns != 'Class']
y = data.loc[:, data.columns == 'Class']
# Random undersampling
# Count the fraud samples (Class == 1) and record their indices
# Number of data points in the minority class
number_records_fraud = len(data[data.Class == 1])
# Record the indices of the fraud samples as a numpy array
fraud_indices = np.array(data[data.Class == 1].index)
# Picking the indices of the normal classes
# Record the indices of the normal samples (Class == 0)
normal_indices = data[data.Class == 0].index
# Out of the indices we picked, randomly select "x" number (number_records_fraud)
# From normal_indices, draw as many indices as there are fraud samples,
# so that the two classes end up balanced.
# np.random.choice(a, size, replace, p): draw `size` items from `a` with probabilities `p`;
# `replace` controls whether sampling is done with replacement.
random_normal_indices = np.random.choice(normal_indices, number_records_fraud, replace=False)
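# Tiny self-contained illustration (assumed example values, independent of the data):
# drawing 3 items from 0..9 without replacement yields 3 distinct values.
demo_draw = np.random.choice(np.arange(10), 3, replace=False)
assert len(set(demo_draw)) == 3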
# Convert the sampled indices to a numpy array
random_normal_indices = np.array(random_normal_indices)
# Appending the 2 indices
# fraud_indices: indices of the fraud samples; random_normal_indices: indices of the sampled normal samples.
# np.concatenate joins the two 1-D index arrays into one new index for the undersampled set.
under_sample_indices = np.concatenate([fraud_indices, random_normal_indices])
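# Tiny illustration (assumed values): for 1-D arrays np.concatenate simply joins them end to end.
assert list(np.concatenate([np.array([1, 2]), np.array([3, 4])])) == [1, 2, 3, 4]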
# Under sample dataset
# loc["a", "b"] selects by label: row label "a", column label "b";
# iloc[1, 1] selects by position: second row, second column.
# Take every column for the rows whose indices we just collected:
# this is the undersampled data set.
under_sample_data = data.iloc[under_sample_indices, :]
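# Tiny illustration of label vs. position indexing (hypothetical demo frame, not part of the data):
demo_df = pd.DataFrame({'v': [10, 20, 30]}, index=['a', 'b', 'c'])
assert demo_df.loc['b', 'v'] == demo_df.iloc[1, 0] == 20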
# Split the undersampled data into features and labels using the 'Class' column
X_undersample = under_sample_data.loc[:, under_sample_data.columns != 'Class']
y_undersample = under_sample_data.loc[:, under_sample_data.columns == 'Class']
# Showing ratio
# After undersampling, each class makes up roughly 50% of the data:
print("Percentage of normal transactions: ", len(under_sample_data[under_sample_data.Class == 0])/len(under_sample_data))
print("Percentage of fraud transactions: ", len(under_sample_data[under_sample_data.Class == 1])/len(under_sample_data))
print("Total number of transactions in resampled data: ", len(under_sample_data))
# Import the train/test splitting helper
from sklearn.model_selection import train_test_split
# Whole dataset
# train_test_split randomly divides the features X and the label column y into
# train and test sets; test_size is the fraction of samples held out for testing.
# Split the full dataset; the undersampled data below uses the same random_state.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)
print("Number transactions train dataset: ", len(X_train))
print("Number transactions test dataset: ", len(X_test))
print("Total number of transactions: ", len(X_train)+len(X_test))
# Undersampled dataset
# Split the balanced (undersampled) data into train and test sets in the same way:
X_train_undersample, X_test_undersample, y_train_undersample, y_test_undersample = train_test_split(
    X_undersample, y_undersample, test_size=0.3, random_state=0)
print("")
print("Number transactions train dataset: ", len(X_train_undersample))
print("Number transactions test dataset: ", len(X_test_undersample))
print("Total number of transactions: ", len(X_train_undersample)+len(X_test_undersample))