天池:O2O优惠券使用预测

#coding=utf-8

import os
import sys
from datetime import datetime, date
from string import Template

import numpy as np
import pandas as pd
from dateutil.parser import parse
from sklearn import ensemble, svm
from sklearn import linear_model
from sklearn import preprocessing
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error, r2_score, roc_auc_score
from sklearn.tree import DecisionTreeRegressor

# Python 2 only: `reload(sys)` re-exposes `sys.setdefaultencoding`, which is
# then used to make UTF-8 the implicit str<->unicode codec. Neither `reload`
# (as a builtin) nor `sys.setdefaultencoding` exists on Python 3, where the
# default encoding is already UTF-8, so the hack is skipped there instead of
# raising NameError.
if sys.version_info[0] < 3:
    reload(sys)  # noqa: F821 - builtin on Python 2
    sys.setdefaultencoding("utf-8")

def getDiscountType(row):
    """Classify a raw ``Discount_rate`` value.

    Args:
        row: One cell of the ``Discount_rate`` column: a string such as
            ``"150:20"`` or ``"0.95"``, or a missing value.

    Returns:
        ``1`` when the value is present and contains no ``':'``; ``0`` when
        the value is missing or is written in the ``"a:b"`` form.
    """
    if pd.isnull(row):
        return 0
    # str() keeps this consistent with the sibling helpers and avoids a
    # TypeError when the column was parsed as numbers rather than strings.
    if ':' in str(row):
        return 0
    return 1

def convertRate(row):
    """Convert a raw ``Discount_rate`` value to a multiplicative rate.

    Args:
        row: One cell of the ``Discount_rate`` column: ``"a:b"``, a plain
            rate such as ``"0.95"``, or a missing value.

    Returns:
        ``1.0`` for a missing value, ``1 - b / a`` for the ``"a:b"`` form,
        and ``float(row)`` otherwise.
    """
    if pd.isnull(row):
        return 1.0
    text = str(row)
    if ':' in text:
        # Split the same string that was tested for ':' (the original tested
        # str(row) but split the raw value).
        parts = text.split(':')
        return 1.0 - float(parts[1]) / float(parts[0])
    return float(row)

def getDiscountMan(row):
    """Return the integer before the colon of an ``"a:b"`` discount.

    Args:
        row: One cell of the ``Discount_rate`` column.

    Returns:
        ``int(a)`` for a value written as ``"a:b"``; ``0`` for anything
        without a ``':'`` (plain rates and missing values included).
    """
    text = str(row)
    if ':' not in text:
        return 0
    # Split the stringified value so non-str inputs cannot raise
    # AttributeError after passing the ':' test.
    return int(text.split(':')[0])

def getDiscountJian(row):
    """Return the integer after the colon of an ``"a:b"`` discount.

    Args:
        row: One cell of the ``Discount_rate`` column.

    Returns:
        ``int(b)`` for a value written as ``"a:b"``; ``0`` for anything
        without a ``':'`` (plain rates and missing values included).
    """
    text = str(row)
    if ':' not in text:
        return 0
    # Split the stringified value so non-str inputs cannot raise
    # AttributeError after passing the ':' test.
    return int(text.split(':')[1])

def getWeekday(row):
    """Return the weekday number of a ``YYYYMMDD`` date.

    Args:
        row: A date written as ``YYYYMMDD`` (normally a string), the string
            ``'nan'`` produced by ``astype(str)`` on a missing cell, or an
            actual missing value.

    Returns:
        An int from 1 (Monday) to 7 (Sunday), or ``np.nan`` when the date is
        missing.
    """
    # 'nan' is what astype(str) yields for missing cells; real None/NaN
    # values are accepted too so the function is safe without that cast.
    if pd.isnull(row) or row == 'nan':
        return np.nan
    text = str(row)
    # date.weekday() is 0-based from Monday; shift to 1..7.
    return date(int(text[0:4]), int(text[4:6]), int(text[6:8])).weekday() + 1

pd.set_option('display.max_columns', None)

# Directory holding the competition CSV files (machine-specific Windows path).
DATA_DIR = r"D:\zcw\tianchi"

# Rows at positions [0, TRAIN_ROWS) are used for fitting, the rest form the
# hold-out split. (The original comment spoke of one million rows, but the
# code has always split at 900000.)
TRAIN_ROWS = 900000

# Model inputs, in the column order the original script fed to the estimator.
FEATURE_COLUMNS = ['ratio', 'cratio', 'Distance', 'discount_man',
                   'discount_jian', 'discount_type',
                   'p7', 'p1', 'p2', 'p3', 'p4', 'p5', 'p6']


def discount_ratio(discount):
    """Return ``b / a`` for a discount written as ``"a:b"``, otherwise 0."""
    text = str(discount)
    if ':' not in text:
        return 0
    parts = text.split(':')
    return float(parts[1]) / float(parts[0])


def clip_non_positive(series):
    """Return *series* as numbers with missing and non-positive values set to 0.

    The values are converted with ``pd.to_numeric`` first because columns
    such as ``Distance`` are read as strings, and comparing a str with ``0``
    is meaningless on Python 2 and a TypeError on Python 3.
    """
    numeric = pd.to_numeric(series, errors='coerce')
    return numeric.where(numeric > 0, 0)


def coupon_usage_ratio(online_df):
    """Compute, per ``Coupon_id``, the share of online rows that carry a date.

    Args:
        online_df: Online records with ``User_id``, ``Coupon_id`` and
            ``Date`` columns (``Date`` as a string, missing when absent).

    Returns:
        A DataFrame with ``Coupon_id``, ``User_id_x`` (all rows),
        ``User_id_y`` (rows whose ``Date`` sorts after ``'2016'``; NaN when
        there are none) and ``cratio = User_id_y / User_id_x``.
    """
    all_rows = online_df.groupby('Coupon_id', as_index=False)['User_id'].count()
    # Missing dates are replaced by '' so they fail the string comparison
    # regardless of how the pandas version compares NaN with str.
    has_date = online_df['Date'].fillna('') > '2016'
    dated_rows = online_df[has_date].groupby(
        'Coupon_id', as_index=False)['User_id'].count()
    usage = pd.merge(all_rows, dated_rows, how='left', on='Coupon_id')
    # Vectorised division is always true division; the original row-wise
    # `/` could floor-divide two integers on Python 2.
    usage['cratio'] = usage['User_id_y'] / usage['User_id_x']
    return usage


def label_redemptions(df):
    """Add ``days`` and ``label`` columns to *df* in place and return it.

    ``days`` is the number of days between ``Date_received`` and ``Date``,
    with missing or non-positive gaps stored as 0 (as before). ``label`` is
    1 when a date exists and the gap is between 0 and 15 days inclusive,
    else 0.

    The original derived the label from the zero-filled ``days`` using
    ``0 < days <= 15``, which made a same-day redemption indistinguishable
    from "never redeemed" and labelled both negative.
    NOTE(review): confirm that same-day use should count as a positive.
    NOTE(review): assumes ``Date`` uses the same YYYYMMDD layout that
    ``getWeekday`` slices out of ``Date_received``.
    """
    used = pd.to_datetime(df['Date'], format='%Y%m%d')
    received = pd.to_datetime(df['Date_received'], format='%Y%m%d')
    gap = (used - received).dt.days
    df['days'] = gap.where(gap > 0, 0)
    # Integer labels: a regressor needs a numeric target, not '0'/'1' strings.
    df['label'] = (gap.notnull() & (gap >= 0) & (gap <= 15)).astype(int)
    return df


def add_features(df, coupon_usage):
    """Return *df* extended with every column listed in FEATURE_COLUMNS.

    Shared by the training and the submission data so both are always
    prepared identically. *df* must contain ``Discount_rate``, ``Distance``,
    ``Coupon_id`` and ``Date_received``; *coupon_usage* is the output of
    ``coupon_usage_ratio``.
    """
    df['ratio'] = df['Discount_rate'].map(discount_ratio)
    df['Distance'] = clip_non_positive(df['Distance'])
    df = pd.merge(df, coupon_usage, how='left', on='Coupon_id')
    # Coupons never seen online have no ratio after the left join.
    df['cratio'] = clip_non_positive(df['cratio'])
    df['discount_man'] = df['Discount_rate'].apply(getDiscountMan)
    df['discount_jian'] = df['Discount_rate'].apply(getDiscountJian)
    df['discount_type'] = df['Discount_rate'].apply(getDiscountType)
    df['weekday'] = df['Date_received'].astype(str).apply(getWeekday)
    # One indicator column per weekday: p1 (Monday) ... p7 (Sunday).
    for day in range(1, 8):
        df['p%d' % day] = (df['weekday'] == day).astype(np.int32)
    return df


def main():
    """Train a linear model on the offline data and write ``result.csv``."""
    # Load the offline training data. Only `sep` is passed: giving both
    # `sep` and `delimiter` is rejected by current pandas.
    train_df = pd.read_csv(
        os.path.join(DATA_DIR, "ccf_offline_stage1_train.csv"), sep=',',
        dtype={'User_id': str, 'Date': str, 'Coupon_id': str,
               'Date_received': str, 'Discount_rate': str, 'Distance': str})
    # Load the online data.
    train_ol_df = pd.read_csv(
        os.path.join(DATA_DIR, "ccf_online_stage1_train.csv"), sep=',',
        dtype={'User_id': str, 'Coupon_id': str, 'Action': str, 'Date': str})

    # Per-coupon usage ratio.
    train_ol = coupon_usage_ratio(train_ol_df)
    print(train_ol.head(10))

    # Labels first (they need the raw dates), then the shared features.
    train_df = label_redemptions(train_df)
    train_df = add_features(train_df, train_ol)
    del train_df['Date']
    del train_df['Date_received']
    del train_df['Discount_rate']
    print(train_df.head(10))

    # Positional split: `.loc[:N]` / `.loc[N:]` are both inclusive, which put
    # row N in the training and the hold-out split at the same time.
    fit_rows = train_df.iloc[:TRAIN_ROWS]
    holdout_rows = train_df.iloc[TRAIN_ROWS:]

    # Standardise with statistics learned from the training rows only and
    # reuse them everywhere; scaling each set on its own maps the same raw
    # value to different model inputs.
    scaler = preprocessing.StandardScaler()
    X_train = scaler.fit_transform(fit_rows[FEATURE_COLUMNS])
    y_train = fit_rows['label']

    # Alternatives tried by the original author: svm.SVR,
    # DecisionTreeRegressor, LogisticRegression, RandomForestRegressor,
    # AdaBoostRegressor, AdaBoostClassifier.
    regr = linear_model.LinearRegression()
    regr.fit(X_train, y_train)

    if len(holdout_rows):
        X_holdout = scaler.transform(holdout_rows[FEATURE_COLUMNS])
        y_holdout = holdout_rows['label']
        holdout_pred = regr.predict(X_holdout)
        # To score the hold-out split:
        # print('score: \n', roc_auc_score(y_holdout, holdout_pred))
        # print('clf: \n', regr.coef_)

    # Load the rows to predict and prepare them exactly like the training set.
    test_df = pd.read_csv(
        os.path.join(DATA_DIR, "ccf_offline_stage1_test_revised.csv"), sep=',',
        dtype={'Date': str, 'Coupon_id': str, 'Date_received': str,
               'Discount_rate': str, 'Distance': str})
    test_df = add_features(test_df, train_ol)
    p_X = scaler.transform(test_df[FEATURE_COLUMNS])

    # Predict.
    test_df['Probability'] = regr.predict(p_X)

    result_df = test_df[['User_id', 'Coupon_id', 'Date_received',
                         'Probability']].copy()
    # A linear model can output negative values; floor them at 0.
    result_df['Probability'] = clip_non_positive(result_df['Probability'])
    print(result_df.head(10))

    # Save.
    result_df.to_csv(os.path.join(DATA_DIR, "result.csv"), sep=',',
                     header=False, index=False)


if __name__ == "__main__":
    main()

最后编辑于
©著作权归作者所有,转载或内容合作请联系作者
  • 序言:七十年代末破托,一起剝皮案震驚了整個(gè)濱河市,隨后出現(xiàn)的幾起案子歧蒋,更是在濱河造成了極大的恐慌土砂,老刑警劉巖,帶你破解...
    沈念sama閱讀 217,826評(píng)論 6 506
  • 序言:濱河連續(xù)發(fā)生了三起死亡事件谜洽,死亡現(xiàn)場(chǎng)離奇詭異萝映,居然都是意外死亡,警方通過(guò)查閱死者的電腦和手機(jī)阐虚,發(fā)現(xiàn)死者居然都...
    沈念sama閱讀 92,968評(píng)論 3 395
  • 文/潘曉璐 我一進(jìn)店門序臂,熙熙樓的掌柜王于貴愁眉苦臉地迎上來(lái),“玉大人实束,你說(shuō)我怎么就攤上這事奥秆。” “怎么了咸灿?”我有些...
    開(kāi)封第一講書人閱讀 164,234評(píng)論 0 354
  • 文/不壞的土叔 我叫張陵构订,是天一觀的道長(zhǎng)。 經(jīng)常有香客問(wèn)我避矢,道長(zhǎng)悼瘾,這世上最難降的妖魔是什么囊榜? 我笑而不...
    開(kāi)封第一講書人閱讀 58,562評(píng)論 1 293
  • 正文 為了忘掉前任,我火速辦了婚禮亥宿,結(jié)果婚禮上卸勺,老公的妹妹穿的比我還像新娘。我一直安慰自己箩绍,他們只是感情好孔庭,可當(dāng)我...
    茶點(diǎn)故事閱讀 67,611評(píng)論 6 392
  • 文/花漫 我一把揭開(kāi)白布。 她就那樣靜靜地躺著材蛛,像睡著了一般圆到。 火紅的嫁衣襯著肌膚如雪。 梳的紋絲不亂的頭發(fā)上卑吭,一...
    開(kāi)封第一講書人閱讀 51,482評(píng)論 1 302
  • 那天芽淡,我揣著相機(jī)與錄音,去河邊找鬼豆赏。 笑死挣菲,一個(gè)胖子當(dāng)著我的面吹牛,可吹牛的內(nèi)容都是我干的掷邦。 我是一名探鬼主播白胀,決...
    沈念sama閱讀 40,271評(píng)論 3 418
  • 文/蒼蘭香墨 我猛地睜開(kāi)眼,長(zhǎng)吁一口氣:“原來(lái)是場(chǎng)噩夢(mèng)啊……” “哼抚岗!你這毒婦竟也來(lái)了或杠?” 一聲冷哼從身側(cè)響起,我...
    開(kāi)封第一講書人閱讀 39,166評(píng)論 0 276
  • 序言:老撾萬(wàn)榮一對(duì)情侶失蹤宣蔚,失蹤者是張志新(化名)和其女友劉穎向抢,沒(méi)想到半個(gè)月后,有當(dāng)?shù)厝嗽跇?shù)林里發(fā)現(xiàn)了一具尸體胚委,經(jīng)...
    沈念sama閱讀 45,608評(píng)論 1 314
  • 正文 獨(dú)居荒郊野嶺守林人離奇死亡挟鸠,尸身上長(zhǎng)有42處帶血的膿包…… 初始之章·張勛 以下內(nèi)容為張勛視角 年9月15日...
    茶點(diǎn)故事閱讀 37,814評(píng)論 3 336
  • 正文 我和宋清朗相戀三年,在試婚紗的時(shí)候發(fā)現(xiàn)自己被綠了亩冬。 大學(xué)時(shí)的朋友給我發(fā)了我未婚夫和他白月光在一起吃飯的照片艘希。...
    茶點(diǎn)故事閱讀 39,926評(píng)論 1 348
  • 序言:一個(gè)原本活蹦亂跳的男人離奇死亡,死狀恐怖硅急,靈堂內(nèi)的尸體忽然破棺而出枢冤,到底是詐尸還是另有隱情,我是刑警寧澤铜秆,帶...
    沈念sama閱讀 35,644評(píng)論 5 346
  • 正文 年R本政府宣布,位于F島的核電站讶迁,受9級(jí)特大地震影響连茧,放射性物質(zhì)發(fā)生泄漏核蘸。R本人自食惡果不足惜,卻給世界環(huán)境...
    茶點(diǎn)故事閱讀 41,249評(píng)論 3 329
  • 文/蒙蒙 一啸驯、第九天 我趴在偏房一處隱蔽的房頂上張望客扎。 院中可真熱鬧,春花似錦罚斗、人聲如沸徙鱼。這莊子的主人今日做“春日...
    開(kāi)封第一講書人閱讀 31,866評(píng)論 0 22
  • 文/蒼蘭香墨 我抬頭看了看天上的太陽(yáng)袱吆。三九已至,卻和暖如春距淫,著一層夾襖步出監(jiān)牢的瞬間绞绒,已是汗流浹背。 一陣腳步聲響...
    開(kāi)封第一講書人閱讀 32,991評(píng)論 1 269
  • 我被黑心中介騙來(lái)泰國(guó)打工榕暇, 沒(méi)想到剛下飛機(jī)就差點(diǎn)兒被人妖公主榨干…… 1. 我叫王不留蓬衡,地道東北人。 一個(gè)月前我還...
    沈念sama閱讀 48,063評(píng)論 3 370
  • 正文 我出身青樓彤枢,卻偏偏與公主長(zhǎng)得像狰晚,于是被迫代替她去往敵國(guó)和親。 傳聞我的和親對(duì)象是個(gè)殘疾皇子缴啡,可洞房花燭夜當(dāng)晚...
    茶點(diǎn)故事閱讀 44,871評(píng)論 2 354

推荐阅读更多精彩内容