使用的數(shù)據(jù)是阿里云天池的數(shù)據(jù)集(https://tianchi.aliyun.com/dataset/dataDetail?dataId=56),數(shù)據(jù)中包含四張表,分別為用戶行為日志 behavior_log(簡(jiǎn)稱 bl)、原始樣本骨架 raw_sample(簡(jiǎn)稱 rs)、廣告基本信息表 ad_feature(簡(jiǎn)稱 af)、用戶基本信息表 user_profile(簡(jiǎn)稱 up)。
下面僅嘗試使用隨機(jī)森林進(jìn)行簡(jiǎn)單的預(yù)測(cè),因此將缺失值直接刪除。最后預(yù)測(cè)效果不錯(cuò),準(zhǔn)確率高達(dá) 93.95%。
代碼如下:
# Imports for the click-through-rate analysis below.
import pandas as pd
import numpy as np
import datetime
import matplotlib.pyplot as plt
import warnings

# BUG FIX: the original line carried a stray mojibake '?' after the call,
# which is a syntax error in plain Python.
warnings.filterwarnings('ignore')  # silence pandas/sklearn warnings
# Load the user profile (up) and ad feature (af) tables; raw_sample is huge,
# so it is opened as a chunked iterator (TextFileReader) instead of one read.
up = pd.read_csv(r'E:\datafile\ad_clik\user_profile.csv')
af = pd.read_csv(r'E:\datafile\ad_clik\ad_feature.csv')
rs = pd.read_csv(r'E:\datafile\ad_clik\raw_sample.csv', iterator=True,
                 chunksize=10000000, header=0)

# Keep only users with no missing profile fields (395,932 rows survive;
# 665,836 users had at least one missing value).
# BUG FIX: the original built `u` from three lists that are never defined in
# this file (user_class_null / user_pvalue_null / user_class_pvalue_null),
# which raises NameError.  Those lists enumerated users missing
# pvalue_level, new_user_class_level, or both, so dropping rows with a NaN
# in either column is assumed equivalent -- TODO confirm the row count.
# NOTE: 'new_user_class_level ' genuinely has a trailing space in this dataset.
complete_up = up.dropna(subset=['pvalue_level', 'new_user_class_level '])
# Plot the distribution of each user feature over the complete (no-NaN)
# profile table.  Ad-attribute distributions were explored too but omitted.
# NOTE: '%matplotlib inline' is an IPython magic and is invalid in a plain
# .py file -- removed; run it manually inside a notebook if needed.
vector = ['cms_segid', 'cms_group_id', 'final_gender_code', 'age_level',
          'pvalue_level', 'shopping_level', 'occupation',
          'new_user_class_level ']  # trailing space is real in this dataset
for col in vector:
    counts = complete_up[col].value_counts().reset_index()
    counts.columns = [col, 'person_count']
    # BUG FIX: the original called sort_values() without keeping the result,
    # so the bars were never actually ordered by feature value.
    counts = counts.sort_values(by=col, ascending=True)
    plt.figure(figsize=(15, 8))
    plt.bar(counts[col].tolist(), counts['person_count'].tolist())
    plt.show()
# Build the training sample: raw_sample rows between 2017-05-06 and
# 2017-05-12 (inclusive, local time) whose user has a complete profile and
# whose ad has a known brand.  Filtered chunks are appended to one CSV.
t1 = '2017-05-06 00:00:00'
t2 = '2017-05-12 23:59:59'
fmt = '%Y-%m-%d %H:%M:%S'
startTime = datetime.datetime.strptime(t1, fmt).timestamp()  # 1494000000.0
endTime = datetime.datetime.strptime(t2, fmt).timestamp()    # 1494604799.0

# Only users present in complete_up.
u = complete_up['userid'].tolist()
# Exclude ads whose brand is missing in af.
a = af[af['brand'].isnull()]['adgroup_id'].tolist()
# Sets give O(1) membership tests inside the chunk loop.
keep_users = set(u)
drop_ads = set(a)

count = 0
for chunk in rs:
    # One boolean mask replaces the four drop(... inplace=True) calls.
    mask = ((chunk.time_stamp >= startTime)
            & (chunk.time_stamp <= endTime)
            & ~chunk['adgroup_id'].isin(drop_ads)
            & chunk['user'].isin(keep_users))
    chunk = chunk[mask].copy()
    # BUG FIX: the original converted timestamps in a per-row loop that also
    # shadowed the builtin name 'list'; map() keeps the identical
    # fromtimestamp (local-time) semantics without either problem.
    chunk.insert(loc=3, column='datetimes',
                 value=chunk['time_stamp'].map(datetime.datetime.fromtimestamp))
    del chunk['time_stamp']
    # header=False: do not write column names (clearer than header=0, and the
    # original line also carried a stray mojibake '?' -- a syntax error).
    chunk.to_csv('E:\\datafile\\rs\\rs_train_complete.csv', mode='a',
                 index=False, header=False)
    count += 1
    print(count, end='-')
print('ok')
# Reload the filtered training sample (written without a header) and attach
# the user-profile and ad-feature columns.
# NOTE: 'datatimes' (sic) must keep this spelling -- it is dropped by name later.
train_cols = ['userid', 'adgroup_id', 'datatimes', 'pid', 'noclk', 'clk']
rs_train = pd.read_csv('E:\\datafile\\rs\\rs_train_complete.csv',
                       header=None, names=train_cols)
df = (rs_train
      .merge(up, how='left', on='userid')
      .merge(af, how='left', on='adgroup_id'))
### High-cardinality ad attributes (cate_id, customer, campaign_id, brand)
### are bucketed by click count and click-through rate so they can serve as
### compact categorical features.  The original repeated the same ~15 lines
### four times (with stale copy-pasted comments); one helper replaces them.
def _clk_bucket_table(frame, key, clk_q, ratio_q, prefix):
    """Map each value of *key* to two bucket features.

    frame   : DataFrame with columns *key* and 'clk' (1 = clicked).
    key     : column to aggregate on.
    clk_q   : quantile count for the click-count qcut.
    ratio_q : quantile count for the click-ratio qcut.
    prefix  : output columns are '<prefix>_clk_bins' and
              '<prefix>_clk_ratio_bins', each labelled 1..10.

    NOTE(review): the q values (16/14, 65/26, 100/30, 40/22) were hand-tuned
    so that after duplicates='drop' exactly 10 bins remain to match the ten
    labels -- pd.qcut raises ValueError if the counts ever disagree on new
    data.
    """
    clicked = frame[key][frame['clk'] == 1].value_counts().reset_index()
    clicked.columns = [key, 'clk']
    totals = frame[key].value_counts().reset_index()
    totals.columns = [key, 'counts']
    # Outer merge keeps never-clicked keys; their clk becomes 0.
    table = pd.merge(clicked, totals, how='outer', on=key).fillna(0)
    table['clk_ratio'] = table['clk'] / table['counts']
    # Round to 4 decimals via string formatting, exactly as the original did.
    table['clk_ratio'] = table['clk_ratio'].map(lambda x: '%.4f' % x).astype(float)
    labels = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
    table[prefix + '_clk_bins'] = pd.qcut(
        table['clk'], clk_q, duplicates='drop', labels=labels).astype(int)
    table[prefix + '_clk_ratio_bins'] = pd.qcut(
        table['clk_ratio'], ratio_q, duplicates='drop', labels=labels).astype(int)
    return table.drop(['clk', 'counts', 'clk_ratio'], axis=1)

cate = _clk_bucket_table(df, 'cate_id', 16, 14, 'cate')
cust = _clk_bucket_table(df, 'customer', 65, 26, 'cust')
camp = _clk_bucket_table(df, 'campaign_id', 100, 30, 'camp')
brand = _clk_bucket_table(df, 'brand', 40, 22, 'brand')
### Chi-square feature-relevance check.
### NOTE(review): this section reads `t`, which is only built further down
### (the merge of df with the bucket tables) -- the original notebook cells
### were evidently run out of order.  Run the merge section first.
from sklearn.feature_selection import chi2, SelectKBest

feature_cols = ['cms_segid', 'cms_group_id', 'final_gender_code', 'age_level',
                'pvalue_level', 'shopping_level', 'occupation',
                'new_user_class_level ',  # trailing space is real
                'cate_clk_bins', 'cate_clk_ratio_bins',
                'cust_clk_bins', 'cust_clk_ratio_bins',
                'camp_clk_bins', 'camp_clk_ratio_bins',
                'brand_clk_bins', 'brand_clk_ratio_bins']
X = t[feature_cols].values
print(X.shape)
y = t['clk'].tolist()

# Keep the 11 most chi2-relevant features.  An exploratory SelectKBest(k='all')
# run showed features 4-8 (age_level .. new_user_class_level) only weakly
# relevant, though all p-values were below 0.05.
selector = SelectKBest(chi2, k=11)
v = selector.fit(X, y).get_support(indices=True)
print(v)
scores = selector.scores_
print(scores)
### SPSS analysis also confirmed that price correlates with clk.
### Prediction: build the modelling table by attaching the four bucket
### tables, then drop the identifier columns that must not feed the model.
t = (df.merge(cate, how='left', on='cate_id')
       .merge(cust, how='left', on='customer')
       .merge(camp, how='left', on='campaign_id')
       .merge(brand, how='left', on='brand'))
# 'datatimes' (sic) matches the column name assigned when the CSV was reloaded.
t = t.drop(['userid', 'adgroup_id', 'datatimes', 'noclk', 'cate_id',
            'campaign_id', 'customer', 'brand'], axis=1)
## Cross-validate on the training split, then score a held-out 20% split.
## BUG FIX: several print lines below carried stray mojibake '?' characters
## in code position (syntax errors in plain Python).
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score, train_test_split

# The 11 strongest categorical features plus pid and price.
rf_x = t[['cms_segid', 'cms_group_id', 'final_gender_code',
          'cate_clk_bins', 'cate_clk_ratio_bins',
          'cust_clk_bins', 'cust_clk_ratio_bins',
          'camp_clk_bins', 'camp_clk_ratio_bins',
          'brand_clk_bins', 'brand_clk_ratio_bins',
          'pid', 'price']].values
rf_y = t['clk'].tolist()
# random_state added so the split (and the printed numbers) are reproducible.
train_X, test_X, train_y, test_y = train_test_split(
    rf_x, rf_y, test_size=1 / 5, random_state=0)

clf1 = RandomForestClassifier(n_estimators=10, max_depth=None,
                              min_samples_split=2, random_state=0)
scores = cross_val_score(clf1, train_X, train_y, scoring='accuracy', cv=5)
clf1.fit(train_X, train_y)
y_pred = clf1.predict(test_X)
print(scores.mean())  # ~0.9395 in the author's run
print(scores.std())   # ~0.0002

# Hold-out summary: false positives, false negatives, accuracy.
test = pd.DataFrame([y_pred, test_y], index=['y_pred', 'test_y']).T
print(((y_pred != test_y) & (y_pred == 1)).sum())
print(((y_pred != test_y) & (y_pred == 0)).sum())
print('預(yù)測(cè)準(zhǔn)確率:', (y_pred == test_y).sum() / len(y_pred))  # ~0.9395
## Build the test set: 2017-05-13 only.
# BUG FIX: `rs` is a chunked TextFileReader that was fully consumed by the
# training loop above -- iterating it again yields nothing.  Re-open it.
rs = pd.read_csv(r'E:\datafile\ad_clik\raw_sample.csv', iterator=True,
                 chunksize=10000000, header=0)

rs_test = pd.DataFrame()
t1 = '2017-05-13 00:00:00'
t2 = '2017-05-13 23:59:59'
fmt = '%Y-%m-%d %H:%M:%S'
startTime = datetime.datetime.strptime(t1, fmt).timestamp()
endTime = datetime.datetime.strptime(t2, fmt).timestamp()

# Same filters as the training loop: complete profiles, known brand.
keep_users = set(complete_up['userid'].tolist())
drop_ads = set(af[af['brand'].isnull()]['adgroup_id'].tolist())

count = 0
for chunk in rs:
    mask = ((chunk.time_stamp >= startTime)
            & (chunk.time_stamp <= endTime)
            & ~chunk['adgroup_id'].isin(drop_ads)
            & chunk['user'].isin(keep_users))
    chunk = chunk[mask].copy()
    # Same local-time conversion as training (fromtimestamp), without the
    # per-row loop that shadowed the builtin 'list'.
    chunk.insert(loc=3, column='datetimes',
                 value=chunk['time_stamp'].map(datetime.datetime.fromtimestamp))
    del chunk['time_stamp']
    rs_test = pd.concat([rs_test, chunk])
    count += 1
    print(count, end='-')
print('ok')
# Name the test columns, attach profile/ad/bucket features, and score the
# trained model on the 2017-05-13 hold-out day.
rs_test.columns = ['userid', 'adgroup_id', 'datetimes', 'pid', 'nonclk', 'clk']
rf_test = (rs_test
           .merge(up, how='left', on='userid')
           .merge(af, how='left', on='adgroup_id')
           .merge(cate, how='left', on='cate_id')
           .merge(cust, how='left', on='customer')
           .merge(camp, how='left', on='campaign_id')
           .merge(brand, how='left', on='brand'))
rf_test = rf_test.drop(
    ['userid', 'adgroup_id', 'datetimes', 'nonclk', 'cate_id', 'campaign_id',
     'customer', 'brand', 'age_level', 'pvalue_level', 'shopping_level',
     'occupation', 'new_user_class_level '], axis=1)

# Drop rows with any missing value (unmatched bucket keys; ~18,451 rows).
RF_test = rf_test.dropna()
# Inspect the unmatched rows; .any(axis=1) replaces the .T.any() transpose trick.
test_null = rf_test[rf_test.isnull().any(axis=1)]
test_null.describe()

test_x = RF_test[['cms_segid', 'cms_group_id', 'final_gender_code',
                  'cate_clk_bins', 'cate_clk_ratio_bins',
                  'cust_clk_bins', 'cust_clk_ratio_bins',
                  'camp_clk_bins', 'camp_clk_ratio_bins',
                  'brand_clk_bins', 'brand_clk_ratio_bins',
                  'pid', 'price']].values
test_y = RF_test['clk'].tolist()
y_pred = clf1.predict(test_x)

# False positives, false negatives, accuracy on the hold-out day.
test = pd.DataFrame([y_pred, test_y], index=['y_pred', 'test_y']).T
print(((y_pred != test_y) & (y_pred == 1)).sum())
print(((y_pred != test_y) & (y_pred == 0)).sum())
print('預(yù)測(cè)準(zhǔn)確率:', (y_pred == test_y).sum() / len(y_pred))