Expedia數據挖掘（Kaggle比賽）

1.使用了SGDLR和Random Forest兩種方法
2.結合data leakage
3.最終得分49.999，在Kaggle排行榜中能排到104位（共1700多隊）
1.leakage solution

# -*- coding: utf-8 -*-
from heapq import nlargest
from operator import itemgetter

def _accumulate(table, key, hotel_cluster, weight):
    """Add ``weight`` to ``table[key][hotel_cluster]``, creating missing entries."""
    per_cluster = table.setdefault(key, {})
    per_cluster[hotel_cluster] = per_cluster.get(hotel_cluster, 0) + weight


def leakage_deal():
    """Build the four lookup tables used by ``submit`` from ``train.csv``.

    Every data row of ``train.csv`` (header skipped, reading stops at the
    first blank line) is split on commas and contributes a weight to up to
    four nested dicts of the form ``{hash(key string): {hotel_cluster: score}}``:

    * ``best_h00``  - key user:city:destination:country:market, rows where
      city, distance, user, destination and country are all non-empty.
    * ``best_h01``  - key user:destination:country:market, rows where city,
      distance, user and destination are non-empty.
    * ``best_hotels_miss_odd`` - same key as ``best_h00`` but only rows whose
      distance field is empty.
    * ``best_hotels_odd_ulc`` - key city:distance, rows where both are
      non-empty; scored with the plain month offset instead of the weight.

    Returns:
        tuple: ``(best_h00, best_h01, best_hotels_odd_ulc, best_hotels_miss_odd)``.
    """
    best_hotels_odd_ulc = {}
    best_hotels_miss_odd = {}
    best_h00 = {}
    best_h01 = {}

    # The context manager closes the file even when a malformed row raises.
    with open("train.csv", "r") as f:
        f.readline()  # skip the header row
        for raw_line in f:
            line = raw_line.strip()
            if line == '':
                break  # a blank line ends the data, exactly as before
            arr = line.split(",")
            book_year = int(arr[0][:4])
            book_month = int(arr[0][5:7])
            user_location_city = arr[5]
            orig_destination_distance = arr[6]
            user_id = arr[7]
            srch_destination_id = arr[16]
            hotel_country = arr[21]
            hotel_market = arr[22]
            is_booking = float(arr[18])
            hotel_cluster = arr[23]

            # Month offset relative to December 2012; later rows weigh more
            # (quadratically), and booking rows get an extra 17.60 boost.
            relative_ref_month = (book_year - 2012) * 12 + (book_month - 12)
            append_weight = relative_ref_month * relative_ref_month * (3 + 17.60 * is_booking)

            has_city = user_location_city != ''
            has_distance = orig_destination_distance != ''
            has_user_and_dest = user_id != '' and srch_destination_id != ''
            has_country = hotel_country != ''

            # best_h00 and best_hotels_miss_odd share this key string.
            full_key = hash(user_id + ':' + user_location_city + ':' + srch_destination_id
                            + ':' + hotel_country + ':' + hotel_market)

            if has_city and has_distance and has_user_and_dest and has_country:
                _accumulate(best_h00, full_key, hotel_cluster, append_weight)

            if has_city and has_distance and has_user_and_dest:
                s01 = hash(user_id + ':' + srch_destination_id + ':' + hotel_country + ':' + hotel_market)
                _accumulate(best_h01, s01, hotel_cluster, append_weight)

            if has_city and not has_distance and has_user_and_dest and has_country:
                _accumulate(best_hotels_miss_odd, full_key, hotel_cluster, append_weight)

            if has_city and has_distance:
                s1 = hash(user_location_city + ':' + orig_destination_distance)
                _accumulate(best_hotels_odd_ulc, s1, hotel_cluster, relative_ref_month)

    return best_h00, best_h01, best_hotels_odd_ulc, best_hotels_miss_odd

def _write_top_clusters(out, scores, limit, filled):
    """Write the best ``limit`` clusters of ``scores`` that are not yet in ``filled``.

    Each written cluster is appended to ``filled``; writing stops once
    ``filled`` holds five entries. Returns the number of clusters written.
    """
    written = 0
    # Sorting the items first makes ties on score resolve by cluster label.
    for cluster, _score in nlargest(limit, sorted(scores.items()), key=itemgetter(1)):
        if cluster in filled:
            continue
        if len(filled) == 5:
            break
        out.write(' ' + cluster)
        filled.append(cluster)
        written += 1
    return written


def submit(best_h00, best_h01, best_hotels_odd_ulc, best_hotels_miss_odd):
    """Write ``leakage_deal.csv`` with up to five clusters per ``test.csv`` row.

    For each test row the candidates are taken, in this order, from:

    1. ``best_hotels_odd_ulc`` (key city:distance), top 5;
    2. ``best_hotels_miss_odd`` (key user:city:destination:country:market),
       top 4, only when the distance field is empty;
    3. ``best_h01`` (key user:destination:country:market), top 4, only when
       the full key is absent from ``best_h00``.

    A cluster is never written twice for the same row. Each output line is
    ``<id>,`` followed by `` <cluster>`` for every selected cluster.

    Args:
        best_h00, best_h01, best_hotels_odd_ulc, best_hotels_miss_odd:
            lookup tables as returned by ``leakage_deal``.
    """
    path = 'leakage_deal.csv'
    count0 = 0
    count00 = 0
    count1 = 0
    # Both handles are closed even if a malformed row raises mid-way.
    with open(path, "w") as out, open("test.csv", "r") as f:
        f.readline()  # skip the header row
        out.write("id,hotel_cluster\n")
        for count, raw_line in enumerate(f, 1):
            line = raw_line.strip()
            if count % 100000 == 0:
                print('Write {} lines...'.format(count))
            if line == '':
                break  # a blank line ends the data, exactly as before
            arr = line.split(",")
            row_id = arr[0]
            user_location_city = arr[6]
            orig_destination_distance = arr[7]
            user_id = arr[8]
            srch_destination_id = arr[17]
            hotel_country = arr[20]
            hotel_market = arr[21]

            out.write(row_id + ',')
            filled = []

            s1 = hash(user_location_city + ':' + orig_destination_distance)
            if s1 in best_hotels_odd_ulc:
                count1 += _write_top_clusters(out, best_hotels_odd_ulc[s1], 5, filled)

            # The same key string addresses best_hotels_miss_odd and best_h00.
            full_key = hash(user_id + ':' + user_location_city + ':' + srch_destination_id
                            + ':' + hotel_country + ':' + hotel_market)
            if orig_destination_distance == '' and full_key in best_hotels_miss_odd:
                count0 += _write_top_clusters(out, best_hotels_miss_odd[full_key], 4, filled)

            s01 = hash(user_id + ':' + srch_destination_id + ':' + hotel_country + ':' + hotel_market)
            if s01 in best_h01 and full_key not in best_h00:
                count00 += _write_top_clusters(out, best_h01[s01], 4, filled)

            out.write("\n")
    print('count 1=',count1)
    print('count 0=',count0)
    print('count 00=',count00)

# Script entry: build the lookup tables from train.csv, then write the
# leak-based submission file from test.csv.
leak_tables = leakage_deal()
best_h00, best_h01, best_hotels_odd_ulc, best_hotels_miss_odd = leak_tables
submit(best_h00, best_h01, best_hotels_odd_ulc, best_hotels_miss_odd)

2.Random Forest

# -*- coding: utf-8 -*-
import pandas as pd
import numpy as np
import h5py
from sklearn.ensemble import RandomForestClassifier

def pre_deal(data):
    '''Add stay-length, lead-time and calendar features to ``data`` in place.

    ``srch_ci`` and ``date_time`` are converted to datetimes when they are not
    already (string values ending in ``'00'`` are first replaced by the
    placeholder ``'2015-12-31'``). Each column is converted independently, so a
    failure on one no longer skips the other, and a failed conversion is
    reported with a warning instead of being swallowed silently.

    Afterwards missing values are filled with 0, the columns
    ``live_in_days``, ``date_to_live_days``, ``ci_month``, ``ci_day``,
    ``date_month``, ``date_day`` and ``date_hour`` are added, and
    ``date_time``, ``user_id``, ``srch_ci`` and ``srch_co`` are dropped.

    Args:
        data: DataFrame holding at least ``date_time``, ``srch_ci``,
            ``srch_co`` and ``user_id``; ``srch_co`` must already be datetime.

    Returns:
        None. ``data`` is modified in place.
    '''
    import warnings

    for column in ('srch_ci', 'date_time'):
        if pd.api.types.is_datetime64_any_dtype(data[column]):
            continue  # already parsed, e.g. by read_csv(parse_dates=...)
        try:
            # NOTE(review): for date_time this also hits every timestamp whose
            # seconds are "00"; kept because the original did the same.
            data.loc[data[column].str.endswith('00'), column] = '2015-12-31'
            data[column] = pd.to_datetime(data[column])
        except (AttributeError, TypeError, ValueError) as exc:
            # Best effort, as before: leave the column untouched but say so.
            warnings.warn('could not convert %s to datetime: %s' % (column, exc))
    data.fillna(0, inplace=True)

    one_day = np.timedelta64(1, 'D')
    # Length of the stay in (fractional) days.
    data['live_in_days'] = data.srch_co - data.srch_ci
    data['live_in_days'] = data['live_in_days'].apply(lambda ts: ts / one_day)
    # Days between the search event and the check-in date.
    data['date_to_live_days'] = data.srch_ci - data.date_time
    data['date_to_live_days'] = data['date_to_live_days'].apply(lambda ts: ts / one_day)
    # Calendar parts of the check-in date and of the event timestamp.
    data['ci_month'] = data['srch_ci'].apply(lambda dt: dt.month)
    data['ci_day'] = data['srch_ci'].apply(lambda dt: dt.day)
    data['date_month'] = data['date_time'].apply(lambda dt: dt.month)
    data['date_day'] = data['date_time'].apply(lambda dt: dt.day)
    data['date_hour'] = data['date_time'].apply(lambda dt: dt.hour)
    data.drop(['date_time', 'user_id', 'srch_ci', 'srch_co'], axis=1, inplace=True)
import os
# Build, or load from the CSV cache, a table keyed by
# (srch_destination_id, hotel_country, hotel_market) with one column per
# hotel_cluster holding that cluster's normalised booking/click score.
if os.path.exists('srch_dest_hc_hm_agg.csv'):
    # A previous run already wrote the table; reuse it.
    agg1 = pd.read_csv('srch_dest_hc_hm_agg.csv')
else:
    reader = pd.read_csv('train.csv', parse_dates=['date_time', 'srch_ci', 'srch_co'], chunksize=200000)
    # Per chunk: sum and count of is_booking for every (destination, country, market, cluster).
    pieces = [chunk.groupby(['srch_destination_id','hotel_country','hotel_market','hotel_cluster'])['is_booking'].agg(['sum','count']) for chunk in reader]
    # Merge the per-chunk partial results into totals per group.
    agg = pd.concat(pieces).groupby(level=[0,1,2,3]).sum()
    agg.dropna(inplace=True)
    # Blend both statistics: 0.85 * sum of is_booking + 0.15 * number of rows.
    agg['sum_and_cnt'] = 0.85*agg['sum'] + 0.15*agg['count']
    # Divide every column by its total within each (destination, country, market) group.
    # NOTE(review): this relies on groupby.apply returning the original index;
    # newer pandas versions may prepend the group keys - confirm before upgrading.
    agg = agg.groupby(level=[0,1,2]).apply(lambda x: x.astype(float)/x.sum())
    agg.reset_index(inplace=True)
    # Wide layout: one row per (destination, country, market), one column per hotel_cluster.
    agg1 = agg.pivot_table(index=['srch_destination_id','hotel_country','hotel_market'], columns='hotel_cluster', values='sum_and_cnt').reset_index()
    agg1.to_csv('srch_dest_hc_hm_agg.csv', index=False)
    # Drop the intermediates to free memory; the merges below use agg1.
    del pieces,agg

# Side tables: destination features for the merges, sample submission for the ids.
destinations = pd.read_csv('destinations.csv')
submission = pd.read_csv('sample_submission.csv')

# warm_start=True keeps the trees of earlier fit() calls, so raising
# n_estimators before each fit adds trees trained on the current chunk.
clf = RandomForestClassifier(n_estimators=100, n_jobs=-1, warm_start=True)
count = 0
chunksize = 200000
reader = pd.read_csv('train.csv', parse_dates=['date_time', 'srch_ci', 'srch_co'], chunksize=chunksize)
for chunk in reader:
    try:
        # Train on booking rows only, enriched with destination and aggregate features.
        chunk = chunk[chunk.is_booking == 1]
        chunk = chunk.merge(destinations, how='left', on='srch_destination_id')
        chunk = chunk.merge(agg1, how='left', on=['srch_destination_id', 'hotel_country', 'hotel_market'])
        pre_deal(chunk)
        y = chunk.hotel_cluster
        chunk.drop(['cnt', 'hotel_cluster', 'is_booking'], axis=1, inplace=True)
        # Fit only when the chunk's target has exactly 100 distinct values.
        if len(y.unique()) == 100:
            clf.set_params(n_estimators=clf.n_estimators + 1)
            clf.fit(chunk, y)
        count += chunksize
        print(count,' have done')
        # Stop after 300 successfully processed chunks.
        if count / chunksize == 300:
            break
    except Exception as e:
        # A failing chunk is reported and skipped; count is not advanced for it.
        print(str(e))

count = 0
chunksize = 10000
# One row per submission id, one column per class the forest was trained on.
preds = np.empty((submission.shape[0], clf.n_classes_))
reader = pd.read_csv('test.csv', parse_dates=['date_time', 'srch_ci', 'srch_co'], chunksize=chunksize)
for chunk in reader:
    # Same enrichment as in training, minus the columns only train.csv has.
    chunk = chunk.merge(destinations, how='left', on='srch_destination_id')
    chunk = chunk.merge(agg1, how='left', on=['srch_destination_id', 'hotel_country', 'hotel_market'])
    chunk.drop(['id'], axis=1, inplace=True)
    pre_deal(chunk)
    pred = clf.predict_proba(chunk)
    # Write this chunk's probabilities into its slice of the result matrix.
    preds[count:count + len(chunk), :] = pred
    count += chunksize
    print(count,' have done')
del clf,agg1

# Persist the class probabilities. When rf.h5 already exists, the stored
# probabilities are added to this run's before being written back, so
# repeated runs accumulate.
if os.path.exists('rf.h5'):
    with h5py.File('rf.h5', 'r+') as hf:
        predslatesthf = hf['preds_latest']
        # Dataset.value was removed in h5py 3; [()] reads the whole dataset
        # and also works on older h5py releases.
        preds += predslatesthf[()]
        predslatesthf[...] = preds
else:
    with h5py.File('rf.h5', 'w') as hf:
        hf.create_dataset('preds_latest', data=preds)

# Column indices of the five highest probabilities per row, best first.
fea_ind = np.argsort(-preds, axis=1)[:,:5]
happend = [' '.join(row.astype(str)) for row in fea_ind]
submit = pd.DataFrame(data=happend, index=submission.id)
submit.reset_index(inplace=True)
submit.columns = submission.columns
submit.to_csv('rf_deal.csv', index=False)

3.SGDLR

# -*- coding: utf-8 -*-
import pandas as pd
from scipy.sparse import csr_matrix, hstack
import numpy as np
import h5py
import pickle
from sklearn.linear_model import SGDClassifier
import os
# Categorical columns: each value is prefixed with its column name and hashed
# into the sparse feature space.
cat_col = [
    'user_id',
    'user_location_city',
    'srch_destination_id',
    'srch_destination_type_id',
    'hotel_continent',
    'hotel_country',
    'hotel_market',
]
# Columns passed through to the model as plain numeric features.
num_col = [
    'is_mobile',
    'is_package',
]
def bin_time(t):
    '''Map a number of days ``t`` to a bucket index from 0 to 4.

    Buckets: ``t < 0`` -> 0, ``t < 2`` -> 1, ``t < 7`` -> 2, ``t < 30`` -> 3,
    anything else -> 4.
    '''
    # The upper bounds are ascending, so the first one exceeding t gives the bucket.
    for bucket, upper_bound in enumerate((0, 2, 7, 30)):
        if t < upper_bound:
            return bucket
    return 4

def pre_process(data):
    '''Turn the raw columns of ``data`` into hashed categorical features, in place.

    ``srch_ci`` and ``date_time`` are converted to datetimes when they are not
    already (string values ending in ``'00'`` are first replaced by the
    placeholder ``'2015-12-31'``). Each column is converted independently, and
    a failed conversion is reported with a warning instead of being swallowed.

    Afterwards missing values are filled with 0 and these columns are written:

    * ``ci_month``: month of ``srch_ci``;
    * ``season_dest``: hash of check-in month combined with destination id;
    * ``date_to_live_days``: lead time in days, bucketed by ``bin_time``;
    * ``time_dest``: hash of the lead-time bucket combined with destination id;
    * every column in ``cat_col``: hash of column name + value.

    NOTE(review): ``hash()`` of a string is salted per process on Python 3
    unless PYTHONHASHSEED is fixed, so features from separate runs (e.g. a
    model reloaded from a pickle) do not line up - confirm how this is run.

    Args:
        data: DataFrame with ``srch_ci``, ``date_time``, ``srch_destination_id``
            and all ``cat_col`` columns.

    Returns:
        None. ``data`` is modified in place.
    '''
    import warnings

    for column in ('srch_ci', 'date_time'):
        if pd.api.types.is_datetime64_any_dtype(data[column]):
            continue  # already parsed, e.g. by read_csv(parse_dates=...)
        try:
            # NOTE(review): for date_time this also hits every timestamp whose
            # seconds are "00"; kept because the original did the same.
            data.loc[data[column].str.endswith('00'), column] = '2015-12-31'
            data[column] = pd.to_datetime(data[column])
        except (AttributeError, TypeError, ValueError) as exc:
            # Best effort, as before: leave the column untouched but say so.
            warnings.warn('could not convert %s to datetime: %s' % (column, exc))
    data.fillna(0, inplace=True)

    # Interaction feature: check-in month x destination.
    data['ci_month'] = data['srch_ci'].apply(lambda dt: dt.month)
    data['season_dest'] = 'season_dest' + data.ci_month.map(str) + '*' + data.srch_destination_id.map(str)
    data['season_dest'] = data['season_dest'].map(hash)

    # Interaction feature: bucketed lead time x destination.
    data['date_to_live_days'] = data.srch_ci-data.date_time
    data['date_to_live_days'] = data['date_to_live_days'].apply(lambda td: td/np.timedelta64(1, 'D'))
    data['date_to_live_days'] = data['date_to_live_days'].map(bin_time)
    data['time_dest'] = 'time_dest' + data.date_to_live_days.map(str) + '*' + data.srch_destination_id.map(str)
    data['time_dest'] = data['time_dest'].map(hash)

    # Prefixing with the column name keeps equal values of different columns apart.
    for col in cat_col:
        data[col] = col + data[col].map(str)
        data[col] = data[col].map(hash)

# Sample submission; its id column indexes the final output frame.
submission = pd.read_csv('sample_submission.csv')
# Every column that is hashed into the sparse matrix: the raw categoricals
# plus the two interaction columns created by pre_process.
cat_col_all = cat_col + ['season_dest', 'time_dest']
def map5eval(preds, actual):
    '''Return the mean average precision at 5 (MAP@5).

    Args:
        preds: 2-D array of scores, one row per sample and one column per class.
        actual: 1-D array-like with the true class index of every sample.

    Returns:
        Mean over all samples of ``1 / rank`` of the true class among the five
        highest-scored classes (0 when it is not among them).
    '''
    actual = np.asarray(actual)
    top_k = min(5, preds.shape[1])
    # argsort is ascending, so reverse each row to get the best class first.
    # (The previous ``-np.arange(5)`` index was [0, -1, -2, -3, -4], which put
    # the least likely class into the first rank.)
    predicted = preds.argsort(axis=1)[:, ::-1][:, :top_k]
    metric = 0.
    for i in range(top_k):
        # Float divisor so the reciprocal rank is not truncated on Python 2.
        metric += np.sum(actual == predicted[:, i]) / (i + 1.0)
    metric /= actual.shape[0]
    return metric

# Resume from a previously pickled classifier when one exists, otherwise
# start a fresh SGD model.
if os.path.exists('sgd.pkl'):
    # NOTE(review): pickle.load executes code from the file; only load a
    # sgd.pkl that this script itself produced.
    with open('sgd.pkl', 'rb') as f:
        clf = pickle.load(f)
else:
    # NOTE(review): newer scikit-learn releases spell this loss 'log_loss' -
    # confirm against the installed version.
    clf = SGDClassifier(loss='log', alpha=0.0000025, verbose=0)
#clf.sparsify()
# Five passes over train.csv, fitting incrementally chunk by chunk.
for epoch in range(5):
    count = 0
    chunksize = 200000
    # Width of the hashed feature space; also reused by the prediction loop below.
    n_features = 3000000
    print('Epoch: ', epoch)
    reader = pd.read_csv('train.csv', parse_dates=['date_time', 'srch_ci', 'srch_co'], chunksize=chunksize)
    for chunk in reader:
        try:
            pre_process(chunk)
            y = chunk.hotel_cluster
            # Sample weight: 5 for booking rows, 1 for all other rows.
            sw = 1 + 4*chunk.is_booking
            chunk.drop(['cnt', 'hotel_cluster', 'is_booking'], axis=1, inplace=True)
            XN = csr_matrix(chunk[num_col].values)
            X = csr_matrix((chunk.shape[0], n_features))
            rows = np.arange(chunk.shape[0])
            # Hashing trick: each categorical value sets the column hash % n_features.
            for col in cat_col_all:
                dat = np.ones(chunk.shape[0])
                cols = chunk[col] % n_features
                X += csr_matrix((dat, (rows, cols)), shape=(chunk.shape[0], n_features))
            # Numeric columns first, followed by the hashed categorical block.
            X = hstack((XN, X))
            # Index labels of the booking rows; subtracting count turns them into
            # positions inside this chunk (count is the number of rows read so far).
            book_indices = sw[sw > 1].index.tolist()
            x_indices=[(x-count) for x in book_indices]
            X_test = csr_matrix(X)[x_indices]
            y_test = y[book_indices]
            clf.partial_fit(X, y, classes=np.arange(100), sample_weight=sw)         
            count = count + chunksize
            # Score the just-updated model on this chunk's booking rows.
            map5 = map5eval(clf.predict_proba(X_test), y_test)
            print((count, map5),' have done')
            # Stop the epoch after 200 chunks.
            if(count/chunksize == 200):
                break
        except Exception as e:
            # count must still advance so the index arithmetic above stays
            # aligned with the reader for the following chunks.
            count = count + chunksize
            print(str(e))
            pass

# Save the trained model so a later run can resume from it.
with open('sgd.pkl', 'wb') as f:
    pickle.dump(clf, f)

count = 0
chunksize = 10000
# Per-chunk probability blocks. They are stacked once after the loop:
# growing the array with np.vstack inside the loop re-copied all earlier
# rows on every chunk.
pred_chunks = []
reader = pd.read_csv('test.csv', parse_dates=['date_time', 'srch_ci', 'srch_co'], chunksize=chunksize)
for chunk in reader:
    chunk.drop(['id'], axis=1, inplace=True)
    pre_process(chunk)
    # Same feature layout as in training: numeric columns followed by the
    # hashed categorical block of width n_features.
    XN = csr_matrix(chunk[num_col].values)
    X = csr_matrix((chunk.shape[0], n_features))
    rows = np.arange(chunk.shape[0])
    for col in cat_col_all:
        dat = np.ones(chunk.shape[0])
        cols = chunk[col] % n_features
        X += csr_matrix((dat, (rows, cols)), shape=(chunk.shape[0], n_features))
    X = hstack((XN, X))
    pred = clf.predict_proba(X)
    pred_chunks.append(pred)
    count = count + chunksize
    print(count,' have done')
# Keep the original empty (0, 100) result when test.csv has no data rows.
preds = np.vstack(pred_chunks) if pred_chunks else np.empty((0,100))
del clf

# Store the probabilities: overwrite the existing 'preds' dataset when
# sgd.h5 is already there, otherwise create the file and the dataset.
h5_mode = 'r+' if os.path.exists('sgd.h5') else 'w'
with h5py.File('sgd.h5', h5_mode) as hf:
    if h5_mode == 'r+':
        predshf = hf['preds']
        predshf[...] = preds
    else:
        hf.create_dataset('preds', data=preds)

# Column indices of the five highest probabilities per row, best first.
col_ind = np.argsort(-preds, axis=1)[:, :5]
hc = [' '.join(top5.astype(str)) for top5 in col_ind]
submit = pd.DataFrame(data=hc, index=submission.id)
submit.reset_index(inplace=True)
submit.columns = submission.columns
submit.to_csv('sgdlr_deal.csv', index=False)

4.blend

# -*- coding: utf-8 -*-
import pandas as pd
import numpy as np
from sklearn.preprocessing import normalize
import h5py

submission = pd.read_csv('sample_submission.csv')

# Read the Random Forest probabilities, L1-normalise every row and weight
# them with 0.54. Dataset.value was removed in h5py 3; [()] reads the whole
# dataset and also works on older h5py releases.
with h5py.File('rf.h5', 'r') as hf:
    predshf = hf['preds_latest']
    preds = 0.54*normalize(predshf[()], norm='l1', axis=1)


# Read the SGD probabilities and add them with weight 0.46. The file name
# matches what the SGD script writes ('sgd.h5', dataset 'preds'); the former
# path '../output/probs/allpreds_sgd.h5' is not produced by any script here.
with h5py.File('sgd.h5', 'r') as hf:
    predshf = hf['preds']
    preds += 0.46*normalize(predshf[()], norm='l1', axis=1)


# Column indices of the five highest blended scores per row, best first.
col_ind = np.argsort(-preds, axis=1)[:,:5]
hc = [' '.join(row.astype(str)) for row in col_ind]

sub = pd.DataFrame(data=hc, index=submission.id)
sub.reset_index(inplace=True)
sub.columns = submission.columns
sub.to_csv('blend_deal.csv', index=False)

5.stack

# -*- coding: utf-8 -*-
import pandas as pd

# Leak-based predictions: rows without any match have an empty cluster cell,
# which becomes '' so that it can be split like the others.
match_pred = pd.read_csv('leakage_deal.csv')
match_pred.fillna('', inplace=True)
match_pred = [cell.split(' ') for cell in match_pred['hotel_cluster'].tolist()]

# Blended model predictions, also as one list of cluster labels per row.
pred_sub = pd.read_csv('blend_deal.csv')
ids = pred_sub['id']
pred_sub = [cell.split(' ') for cell in pred_sub['hotel_cluster'].tolist()]

def f0(seq, idfun=None):
    '''Return the items of ``seq`` without duplicates, keeping first-seen order.

    ``idfun`` maps an item to the marker used for comparison (identity when
    omitted). Items whose marker equals the empty string are dropped.
    '''
    key_of = idfun if idfun is not None else (lambda x: x)
    seen = set()
    result = []
    for item in seq:
        marker = key_of(item)
        if marker in seen or marker == '':
            continue
        seen.add(marker)
        result.append(item)
    return result
    
# Leak predictions come first, then the model's; after removing duplicates
# and blanks the first five clusters are kept for every row.
full_preds = [f0(match_pred[i] + model_row)[:5] for i, model_row in enumerate(pred_sub)]

write_p = [" ".join(str(cluster) for cluster in row) for row in full_preds]
write_frame = ["id,hotel_cluster"] + [
    "{0},{1}".format(ids[i], clusters) for i, clusters in enumerate(write_p)
]
with open("final_predictions.csv", "w+") as f:
    f.write("\n".join(write_frame))

結果：

image.png

最后編輯于：2018.10.11 13:00:21

©著作權歸作者所有，轉載或內容合作請聯繫作者

人面猴
序言：七十年代末，一起剝皮案震驚了整個濱河市，隨後出現的幾起案子，更是在濱河造成了極大的恐慌，老刑警劉巖，帶你破解...
沈念sama閱讀 206,723評論 6贊 481
死咒
序言：濱河連續發生了三起死亡事件，死亡現場離奇詭異，居然都是意外死亡，警方通過查閱死者的電腦和手機，發現死者居然都...
沈念sama閱讀 88,485評論 2贊 382
救了他兩次的神仙讓他今天三更去死
文/潘曉璐我一進店門垃环，熙熙樓的掌柜王于貴愁眉苦臉地迎上來，“玉大人返敬，你說我怎么就攤上這事晴裹。” “怎么了救赐？”我有些...
開封第一講書人閱讀 152,998評論 0贊 344
道士緝兇錄：失蹤的賣姜人
文/不壞的土叔我叫張陵，是天一觀的道長只磷。經(jīng)常有香客問我经磅，道長，這世上最難降的妖魔是什么钮追？我笑而不...
開封第一講書人閱讀 55,323評論 1贊 279
?港島之戀（遺憾婚禮）
正文為了忘掉前任预厌，我火速辦了婚禮，結果婚禮上元媚，老公的妹妹穿的比我還像新娘轧叽。我一直安慰自己，他們只是感情好刊棕，可當我...
茶點故事閱讀 64,355評論 5贊 374
惡毒庶女頂嫁案：這布局不是一般人想出來的
文/花漫我一把揭開白布炭晒。她就那樣靜靜地躺著，像睡著了一般甥角。火紅的嫁衣襯著肌膚如雪网严。梳的紋絲不亂的頭發(fā)上，一...
開封第一講書人閱讀 49,079評論 1贊 285
城市分裂傳說
那天嗤无，我揣著相機與錄音震束，去河邊找鬼怜庸。笑死，一個胖子當著我的面吹牛垢村，可吹牛的內(nèi)容都是我干的割疾。我是一名探鬼主播，決...
沈念sama閱讀 38,389評論 3贊 400
雙鴛鴦連環(huán)套：你想象不到人心有多黑
文/蒼蘭香墨我猛地睜開眼嘉栓，長吁一口氣：“原來是場噩夢啊……” “哼宏榕！你這毒婦竟也來了？” 一聲冷哼從身側響起胸懈，我...
開封第一講書人閱讀 37,019評論 0贊 259
萬榮殺人案實錄
序言：老撾萬榮一對情侶失蹤担扑，失蹤者是張志新（化名）和其女友劉穎，沒想到半個月后趣钱，有當?shù)厝嗽跇淞掷锇l(fā)現(xiàn)了一具尸體涌献，經(jīng)...
沈念sama閱讀 43,519評論 1贊 300
?護林員之死
正文獨居荒郊野嶺守林人離奇死亡，尸身上長有42處帶血的膿包…… 初始之章·張勛以下內(nèi)容為張勛視角年9月15日...
茶點故事閱讀 35,971評論 2贊 325
?白月光啟示錄
正文我和宋清朗相戀三年首有，在試婚紗的時候發(fā)現(xiàn)自己被綠了燕垃。大學時的朋友給我發(fā)了我未婚夫和他白月光在一起吃飯的照片。...
茶點故事閱讀 38,100評論 1贊 333
活死人
序言：一個原本活蹦亂跳的男人離奇死亡井联，死狀恐怖卜壕，靈堂內(nèi)的尸體忽然破棺而出，到底是詐尸還是另有隱情烙常，我是刑警寧澤轴捎，帶...
沈念sama閱讀 33,738評論 4贊 324
?日本核電站爆炸內(nèi)幕
正文年R本政府宣布，位于F島的核電站蚕脏，受9級特大地震影響侦副，放射性物質(zhì)發(fā)生泄漏。R本人自食惡果不足惜驼鞭，卻給世界環(huán)境...
茶點故事閱讀 39,293評論 3贊 307
男人毒藥：我在死后第九天來索命
文/蒙蒙一秦驯、第九天我趴在偏房一處隱蔽的房頂上張望。院中可真熱鬧挣棕，春花似錦译隘、人聲如沸。這莊子的主人今日做“春日...
開封第一講書人閱讀 30,289評論 0贊 19
一樁弒父案固耘，背后竟有這般陰謀
文/蒼蘭香墨我抬頭看了看天上的太陽。三九已至皂甘，卻和暖如春玻驻，著一層夾襖步出監(jiān)牢的瞬間，已是汗流浹背。一陣腳步聲響...
開封第一講書人閱讀 31,517評論 1贊 262
情欲美人皮
我被黑心中介騙來泰國打工璧瞬，沒想到剛下飛機就差點兒被人妖公主榨干…… 1. 我叫王不留户辫，地道東北人。一個月前我還...
沈念sama閱讀 45,547評論 2贊 354
代替公主和親
正文我出身青樓嗤锉，卻偏偏與公主長得像渔欢，于是被迫代替她去往敵國和親。傳聞我的和親對象是個殘疾皇子瘟忱，可洞房花燭夜當晚...
茶點故事閱讀 42,834評論 2贊 345

Expedia數據挖掘（Kaggle比賽）

推薦閱讀更多精彩內(nèi)容