airport-天池

import pandas as pd
from dateutil.parser import parse 
import datetime
import numpy as np
import pylab as pl
from sympy import *

import matplotlib
import numpy as np
import matplotlib.pyplot as plt

導入并預處理數據

分析wifi每天的變化,可知在凌晨4點時安檢、航班以及wifi連接數為0,因此將其作為分界點

def imp_dat(data_dir="E:\\data_learn\\tianchi_stround\\da2"):
    """Load the five raw competition CSV files as DataFrames.

    Args:
        data_dir: Directory that contains the CSV files. It defaults to the
            path that used to be hard-coded, so existing zero-argument calls
            read exactly the same files as before.

    Returns:
        Tuple ``(departure, flights, gates, security_check, wifi_records)``
        with one DataFrame per file, in that order.
    """
    import os  # local import so the block does not rely on a file-level one

    file_names = (
        "airport_gz_departure_chusai_2ndround.csv",
        "airport_gz_flights_chusai_2ndround.csv",
        "airport_gz_gates.csv",
        "airport_gz_security_check_chusai_2ndround.csv",
        "WIFI_AP_Passenger_Records_chusai_2ndround.csv",
    )
    return tuple(pd.read_csv(os.path.join(data_dir, name))
                 for name in file_names)
# Average the WiFi connection counts within each 10-minute window
def mean_wifi(wifi_records):
    """Average WiFi passenger counts per access point and 10-minute slice.

    Args:
        wifi_records: DataFrame with the columns ``WIFIAPTag``,
            ``timeStamp`` and ``passengerCount``. ``timeStamp`` values are
            expected to be strings laid out as ``YYYY-MM-DD-HH-MM-SS`` (the
            fixed-position slicing below depends on that layout).

    Returns:
        DataFrame indexed by the slice label ``YYYY-MM-DD-HH-M`` (``M`` is
        the tens digit of the minute) with one column per ``WIFIAPTag``;
        each cell is the mean ``passengerCount`` of that access point in that
        slice, NaN where the access point has no record.

    Side effects:
        As before, the helper columns ``timeTen`` and ``timeStr`` are written
        onto the caller's ``wifi_records`` frame.
    """
    stamps = [str(stamp) for stamp in wifi_records['timeStamp'].values]

    # Floor division keeps timeTen an integer on Python 3 as well
    # (the original "/10*10" only truncated under Python 2).
    wifi_records['timeTen'] = [int(s[8:16].replace('-', '')) // 10 * 10
                               for s in stamps]
    wifi_records['timeStr'] = [s[:10] for s in stamps]

    # The slice label is just the first 15 characters of the timestamp.
    # Taking it straight from the string avoids the int -> str round trip,
    # which dropped the leading zero of days 01-09 and corrupted the label.
    keyed = pd.DataFrame({
        'WIFIAPTag': wifi_records['WIFIAPTag'].values,
        'slice10min': [s[:15] for s in stamps],
        'passengerCount': wifi_records['passengerCount'].values,
    })
    slice_mean = keyed.groupby(['WIFIAPTag', 'slice10min'])['passengerCount'].mean()
    # Rows: 10-minute slices, columns: access points.
    return slice_mean.unstack().T
# Split the WiFi connection counts by day
def getWIFIday11_24(wifi_all_split):
    """Split the 10-minute WiFi table into one frame per airport day.

    An "airport day" runs from 04:00 to 04:00 of the next calendar day, both
    bounds inclusive, so the 04:00 slice appears in two consecutive days.
    Despite the function name, the loop covers 2016-09-10 .. 2016-09-25.

    Args:
        wifi_all_split: DataFrame whose index holds slice labels such as
            ``'2016-09-10-04-0'``; rows are selected by plain string
            comparison on those labels.

    Returns:
        ``(wifi_day, wifi_dayt)``: two lists with 16 DataFrames each.
        ``wifi_day[k]`` holds the rows of the k-th airport day and
        ``wifi_dayt[k]`` the rows between 15:00 and 17:50 of that date.
    """
    labels = list(wifi_all_split.index)
    wifi_day = []
    wifi_dayt = []
    for day in range(10, 26):
        day_start = '2016-09-' + str(day) + '-04-0'
        day_end = '2016-09-' + str(day + 1) + '-04-0'
        aft_start = '2016-09-' + str(day) + '-15-0'
        aft_end = '2016-09-' + str(day) + '-17-5'
        in_day = np.array([day_start <= a <= day_end for a in labels], dtype=bool)
        in_aft = np.array([aft_start <= a <= aft_end for a in labels], dtype=bool)
        # .ix was removed in pandas 1.0; a boolean mask with .loc selects the
        # same rows.
        wifi_day.append(wifi_all_split.loc[in_day, :])
        wifi_dayt.append(wifi_all_split.loc[in_aft, :])
    return wifi_day, wifi_dayt
# Pipeline step 1: load the raw tables, build the 10-minute WiFi table and
# split it into per-day frames (module-level results reused further down).
departure,flights,gates,security_check,wifi_records=imp_dat()
wifi_all_split=mean_wifi(wifi_records)
wifi_day,wifi_dayt=getWIFIday11_24(wifi_all_split)
# Merge flights with boarding-gate areas
def getFla_gat(flights,gates):
    """Merge flights with their gate area and derive per-flight time fields.

    Args:
        flights: DataFrame with at least ``scheduled_flt_time``,
            ``actual_flt_time`` and ``BGATE_ID``. Time cells that are not
            strings (e.g. missing values) are treated as missing.
        gates: DataFrame joined onto ``flights`` via ``BGATE_ID`` (left join).

    Returns:
        The merged DataFrame without the two raw time columns and with:
        ``scheduled_flt`` / ``acutal_flt`` (parsed time + 8 hours, or the
        sentinel ``0`` when missing), ``timeInDay`` (``HH:MM:SS`` of the
        scheduled time), ``late_time/min`` (delay in minutes, ``-1`` when a
        time is missing) and ``id_flt`` (``HH:MM:SS_<BGATE_ID>``).

    Side effects:
        As before, ``scheduled_flt`` and ``acutal_flt`` are also added to the
        caller's ``flights`` frame.
    """
    from dateutil.parser import parse  # same parser the file imports at top

    def _plus_8h(raw):
        # NOTE(review): the +8h shift presumably converts UTC to Beijing
        # time -- confirm against the data description.
        if isinstance(raw, str):
            return parse(raw) + datetime.timedelta(hours=8)
        return 0

    flights['scheduled_flt'] = [_plus_8h(v) for v in flights['scheduled_flt_time'].values]
    flights['acutal_flt'] = [_plus_8h(v) for v in flights['actual_flt_time'].values]

    flight_gate = pd.merge(flights, gates, on='BGATE_ID', how='left')
    # Columns are addressed by name; the original used positions 4 and 5,
    # which silently depended on the column order of the CSV file.
    scheduled = list(flight_gate['scheduled_flt'])
    actual = list(flight_gate['acutal_flt'])

    flight_gate['timeInDay'] = [str(t)[11:19] for t in scheduled]

    late_minutes = []
    for sched, act in zip(scheduled, actual):
        if act == 0 or sched == 0:
            # A missing time on either side means the delay is unknown.
            late_minutes.append(-1)
        else:
            late_minutes.append(round((act - sched).total_seconds(), 0) / 60)
    flight_gate['late_time/min'] = late_minutes

    flight_gate['id_flt'] = [str(t)[-8:] + '_' + str(gate)
                             for t, gate in zip(scheduled, flight_gate['BGATE_ID'])]

    del flight_gate['scheduled_flt_time']
    del flight_gate['actual_flt_time']
    return flight_gate
def separate_flight(flight_gate):
    """Split the merged flight table into one frame per airport day.

    An airport day runs from 04:00 (inclusive) to 04:00 of the next calendar
    day (exclusive); the loop covers 2016-09-10 .. 2016-09-25.

    Args:
        flight_gate: DataFrame with the columns ``scheduled_flt`` (datetime
            values), ``flight_ID``, ``id_flt``, ``BGATE_AREA`` and
            ``BGATE_ID``.

    Returns:
        ``(all_, plane_fight_dic)``: two lists with 16 entries each.
        ``all_[k]`` is an independent copy of the k-th day's rows with the
        extra columns ``flt_set`` (list of all flight IDs that share the
        row's ``id_flt`` on that day) and ``area_gate``
        (``BGATE_AREA + BGATE_ID``). ``plane_fight_dic[k]`` maps each
        ``id_flt`` of that day to its list of flight IDs.

    Raises:
        TypeError: if a ``scheduled_flt`` value cannot be compared with a
            datetime (for example the ``0`` missing-value sentinel).
    """
    scheduled = list(flight_gate['scheduled_flt'])
    all_ = []
    plane_fight_dic = []
    for day in range(10, 26):
        day_start = datetime.datetime(2016, 9, day, 4, 0)
        day_end = datetime.datetime(2016, 9, day + 1, 4, 0)
        in_day = np.array([t >= day_start and t < day_end for t in scheduled],
                          dtype=bool)
        # Copy the slice so the column assignments below neither touch
        # flight_gate nor trigger SettingWithCopyWarning.
        day_rows = flight_gate.loc[in_day].copy()

        ids_by_slot = {}
        for flight_id, slot in zip(day_rows['flight_ID'], day_rows['id_flt']):
            ids_by_slot.setdefault(slot, []).append(flight_id)

        day_rows['flt_set'] = [ids_by_slot[slot] for slot in day_rows['id_flt']]
        day_rows['area_gate'] = (day_rows['BGATE_AREA'] + day_rows['BGATE_ID']).values
        all_.append(day_rows)
        plane_fight_dic.append(ids_by_slot)
    return all_, plane_fight_dic
# Pipeline step 2: merge flights with gates, then split the merged table into
# per-day frames plus the per-day id_flt -> flight-ID dictionaries.
flight_gate=getFla_gat(flights,gates)
fl_gt,plane_flight_dic=separate_flight(flight_gate)
D:\Anaconda2\lib\site-packages\ipykernel\__main__.py:56: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
D:\Anaconda2\lib\site-packages\ipykernel\__main__.py:57: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
# res_new is the merged table of security-check counts and flights.
# It is read below as res[0][i].groupby(...), i.e. it stores pickled Python
# objects, which NumPy >= 1.16.3 refuses to load unless allow_pickle=True.
# NOTE(security): unpickling can execute arbitrary code -- only load a
# res_new.npy file that you created yourself or otherwise trust.
res=np.load('res_new.npy',allow_pickle=True)

def fli_gat_count(sec_by_day=None, flights_by_day=None):
    """Attach security-check head counts to each day's flight table.

    Translated from the original note: the merged table has the columns
    id_flt, 0, BGATE_ID, scheduled_flt, acutal_flt, BGATE_AREA, timeInDay,
    late_time/min, area_gate and id_concat, where column ``0`` is the total
    number of security checks for that ``id_flt`` group (a departure-time /
    gate slot, not a single flight) and ``id_concat`` lists the flights that
    belong to the group.

    Args:
        sec_by_day: Sequence of per-day security-check DataFrames with a
            datetime ``secTime`` column and an ``id_flt`` column. Defaults to
            the module-level ``res[0]``.
        flights_by_day: Sequence of per-day flight DataFrames that contain
            ``id_flt``, ``flight_ID`` and ``flt_set``. Defaults to the
            module-level ``fl_gt``.

    Returns:
        List with one DataFrame per day: the flight rows (right join on
        ``id_flt``) plus column ``0`` and ``id_concat``, without
        ``flight_ID`` / ``flt_set`` and with duplicate rows removed.
    """
    if sec_by_day is None:
        sec_by_day = res[0]
    if flights_by_day is None:
        flights_by_day = fl_gt

    all_ = []
    for i in range(len(sec_by_day)):
        # Count checks per (time, id_flt), bin into 10-minute slots and take
        # the maximum of the running total, i.e. the total per id_flt.
        # '10min' replaces the deprecated '10T' alias (same frequency).
        per_slot = (sec_by_day[i]
                    .groupby(['secTime', 'id_flt']).size().unstack()
                    .resample('10min').sum().fillna(0)
                    .cumsum().max().reset_index())
        tmp_count = pd.merge(per_slot, flights_by_day[i], how='right', on='id_flt')
        del tmp_count['flight_ID']
        tmp_count['id_concat'] = [''.join('_' + flight_id for flight_id in ids)
                                  for ids in tmp_count['flt_set'].values]
        # flt_set holds lists, which are unhashable; drop it before
        # drop_duplicates().
        del tmp_count['flt_set']
        all_.append(tmp_count.drop_duplicates())
    return all_
# Per-day flight tables enriched with security-check counts (one per day).
fl_count=fli_gat_count()

建立模型

假設每個機型對該wifi點的影響時段為起飛前3小時,總人數如果為N,那前3小時每10分鐘的wifi比例為a_i(i=1,2,...,18)。如果延期,那么延期的時間段內wifi比例一直為a_18,即最后一個比例值。將該候車廳所有機型對應的人數相加,與實際wifi數對應;最后,使用簡單線性回歸求取參數a_i。

每個機型對全天wifi_count的影響函數

# Notebook scratch cell: scale the symbolic parameter vector by 15 and echo it.
passAr=np.array(getpassAr())*15
passAr
# NOTE(review): the next four lines look like the notebook's printed value of
# passAr that was pasted into the source along with the code -- confirm.
array([15*a_0, 15*a_1, 15*a_2, 15*a_3, 15*a_4, 15*a_5, 15*a_6, 15*a_7,
       15*a_8, 15*a_9, 15*a_10, 15*a_11, 15*a_12, 15*a_13, 15*a_14,
       15*a_15, 15*a_16, 15*a_17, 15*a_18, 15*a_19, 15*a_20, 15*a_21,
       15*a_22, 15*a_23, 15*a_24], dtype=object)
# Sample values for experimenting with the slot layout used by getpassCount.
n=30;parN=25;rN=19;n_last=1;
#np.array([0]*(n-parN)+list(passAr[:rN])+list(passAr[rN])*n_last+list(passAr[-6:])+(144-n-n_last)*[0])
passAr[rN]
# NOTE(review): presumably the echoed value of passAr[rN] -- confirm.
15*a_19
# N is the total number of passengers of the flight
from dateutil.parser import parse
parN=25# number of parameters
def getpassCount(sctN, params=None):
    """Lay one flight's parameter profile onto the 144 ten-minute day slots.

    The airport day starts at 04:00, so slot 0 is 04:00-04:10. With ``P``
    parameters the untruncated profile is::

        params[:P-6]  +  [params[P-6]] * n_last  +  params[P-6:]

    It starts ``P`` slots before the scheduled departure slot ``n`` and is
    then clipped / zero-padded to exactly 144 entries. This reproduces every
    working branch of the original if/elif chain and also covers the cases
    it got wrong: the first branch multiplied a list by a list (TypeError),
    ``n + n_last == 150`` matched no branch and returned None, and ``n < 6``
    produced a vector of the wrong length.

    Args:
        sctN: ``(sct, act, N)``: scheduled departure, actual departure (both
            datetime-like) and the passenger count used to scale the profile.
        params: Optional sequence of profile parameters; defaults to
            ``getpassAr()``. The last six entries are the tail that follows
            the delay slots.

    Returns:
        NumPy array of length 144 with the scaled parameters in the flight's
        slots and 0 elsewhere.

    Raises:
        TypeError / AttributeError: if ``sct`` or ``act`` is not
            datetime-like (for example the ``0`` missing-value sentinel).
    """
    sct, act, N = sctN
    if params is None:
        params = getpassAr()
    passAr = np.array(params) * N

    slots_per_day = 144          # 24 h of 10-minute slots
    par_count = len(passAr)
    rN = par_count - 6           # index of the parameter repeated while delayed

    # Slot of the scheduled departure, counted from 04:00 and wrapping around
    # midnight (same value as the original ".seconds / 600" under Python 2).
    since_4am = (sct.hour * 3600 + sct.minute * 60 + sct.second - 4 * 3600) % 86400
    n = since_4am // 600
    # Whole 10-minute slots of delay. An early departure counts as no delay;
    # the original ".seconds" wrapped it around to roughly 24 hours.
    n_last = max(int((act - sct).total_seconds() // 600), 0)

    profile = list(passAr[:rN]) + [passAr[rN]] * n_last + list(passAr[rN:])
    lead = n - par_count
    if lead >= 0:
        profile = [0] * lead + profile
    else:
        # The profile would start before 04:00: drop the part that falls
        # outside the day.
        profile = profile[-lead:]
    profile = (profile + [0] * slots_per_day)[:slots_per_day]
    return np.array(profile)

一個入口所有航班wifi_count

def allofgate(N=1,gate_id='A01'):
    """Sum the slot profiles of every flight at one gate on one day.

    Args:
        N: Index of the day in the module-level ``fl_count`` list.
        gate_id: ``BGATE_ID`` of the gate whose flights are summed.

    Returns:
        Element-wise sum of ``getpassCount`` over the gate's flights
        (one entry per 10-minute slot of the day).

    Raises:
        IndexError: if the gate has no flight on that day.
        Exception: whatever ``getpassCount`` raises for the first flight;
            only failures of the later flights are skipped.
    """
    # Flight list of the gate: [[sct, act, N], [sct, act, N], ...]
    fl_counN = fl_count[N]
    at_gate = fl_counN[fl_counN['BGATE_ID'] == gate_id]
    # Column 0 is the head count produced by fli_gat_count; .loc replaces
    # .ix, which was removed in pandas 1.0.
    gatN = at_gate.loc[:, ['scheduled_flt', 'acutal_flt', 0]].values
    all_ = getpassCount(gatN[0])
    for i in range(1, len(gatN)):
        try:
            all_ = all_ + getpassCount(gatN[i])
        except Exception:
            # Best effort, as before: report the position of the flight that
            # could not be added and continue with the rest.
            print(i)
    return all_
# Stack the per-day flight tables into one frame, tagging rows with their date.
# NOTE(review): only days 1.. get a 'day' value; rows from fl_count[0] have
# none and are therefore left out of the groupby below -- confirm that this
# is intended.
fl_counNall=fl_count[0]
for i in range(1,len(fl_count)):
    try:
        tmp=fl_count[i]
        tmp['day']=[a[0:10] for a in fl_count[i]['scheduled_flt'].astype(str)]
        # DataFrame.append was removed in pandas 2.0; pd.concat is equivalent.
        fl_counNall=pd.concat([fl_counNall,tmp])
    except Exception:
        # Best effort, as before: report the index of the day that failed.
        print(i)

# Number of flight rows per gate (rows) and day (columns).
fl_gb=fl_counNall.groupby(['BGATE_ID','day']).size().unstack().fillna(0)

fl_counNall.fillna(0,inplace=True)
# Rows whose gate was missing (BGATE_ID was NaN before the fillna above).
gate_nan=fl_counNall[fl_counNall['BGATE_ID']==0]

#fl_counNall.groupby(['BGATE_ID','day']).size().unstack().fillna(0).head(15)

#fl_counNall.groupby(['BGATE_ID','day']).size().unstack().fillna(0).head(15)

一個區域對應的wifi點

def getwifiArea():
    """Group the WiFi access-point tags by their two-character prefix.

    The tags are the column labels of the module-level ``wifi_day[1]`` frame.

    Returns:
        Dict that maps each two-character prefix to the list of tags that
        start with it, in column order.
    """
    tags_by_prefix = {}
    for tag in wifi_day[1].columns:
        tags_by_prefix.setdefault(tag[:2], []).append(tag)
    return tags_by_prefix
# Map of area prefix -> WiFi access-point tags.
wi_dic=getwifiArea()
### Define the count parameters for the 18 ten-minute slots before departure
from sympy import *
parN=25# number of parameters defined
def getpassAr():
    """Create the symbolic parameters ``a_0`` .. ``a_<parN-1>``.

    Returns:
        What ``symbols`` returns for the list of ``parN`` names
        ``'a_0', 'a_1', ...`` (``parN`` is read from module scope).
    """
    names = ['a_' + str(k) for k in range(parN)]
    return symbols(names)
# Module-level parameter list; getFactor uses it as its default ``parms``.
parabc=getpassAr()
def getFactor(spy, parms=None):
    """Return the integer coefficient of every parameter in an expression.

    Args:
        spy: A plain number or string (no parameters, so all coefficients
            are 0) or an expression object that offers ``is_Add``, ``args``
            and ``coeff`` (as the symbolic expressions used in this file do).
        parms: Parameters to look up. Defaults to the module-level
            ``parabc``, resolved at call time.

    Returns:
        Integer NumPy array with one entry per parameter. For a sum the
        coefficients of its terms are added up; each coefficient is
        truncated with ``int()`` first, as before.
    """
    if parms is None:
        parms = parabc
    parms = list(parms)
    totals = np.array([0] * len(parms))

    # isinstance (instead of type(x) == T) also covers bools and numpy
    # scalars, which have no is_Add attribute and used to raise here.
    if isinstance(spy, (int, float, str, np.number)):
        return totals

    # A sum contributes term by term; anything else is a single term.
    terms = spy.args if spy.is_Add else (spy,)
    for term in terms:
        totals = totals + np.array([int(term.coeff(sym)) for sym in parms])
    return totals
def factorMatrix(p1):
    """Build the coefficient matrix for a sequence of expressions.

    Args:
        p1: Iterable of expressions or numbers, one per 10-minute slot.

    Returns:
        NumPy array with one ``getFactor`` row per element that could be
        converted.

    Note:
        An element that fails to convert is printed and skipped, as before.
        The result then has fewer rows than ``p1``, so callers that pair the
        rows with another per-slot vector should check the lengths.
    """
    rows = []
    for expr in p1:
        try:
            rows.append(getFactor(expr))
        except Exception:
            # Best effort, as before: show the offending element and go on.
            print(expr)
    return np.array(rows)

線性回歸分析

前N天所有數據

# Collect, for days 1..14, the first 60 ten-minute slots of the design matrix
# of gate 'A101' (x) and of the observed counts of one WiFi access point (y),
# then fit a linear regression and plot the estimated coefficients.
x_list,y_list=[],[]
for n in range(1,15):
    x=factorMatrix(allofgate(N=n,gate_id='A101'))
    # .loc replaces .ix (removed in pandas 1.0); [:-1] drops the last slice
    # of the day's frame.
    y=wifi_day[n].loc[:,'E1-3A<E1-3-01>'].values[:-1]
    x,y=x[:60],y[:60]
    x_list,y_list=x_list+list(x),y_list+list(y)
from sklearn.linear_model import LinearRegression
rg=LinearRegression()
rg.fit(x_list,y_list)
pl.plot(rg.coef_)
最后編輯于
?著作權(quán)歸作者所有,轉(zhuǎn)載或內(nèi)容合作請(qǐng)聯(lián)系作者
  • 序言:七十年代末毙玻,一起剝皮案震驚了整個(gè)濱河市豌蟋,隨后出現(xiàn)的幾起案子,更是在濱河造成了極大的恐慌淆珊,老刑警劉巖夺饲,帶你破解...
    沈念sama閱讀 219,427評(píng)論 6 508
  • 序言:濱河連續(xù)發(fā)生了三起死亡事件,死亡現(xiàn)場(chǎng)離奇詭異施符,居然都是意外死亡往声,警方通過(guò)查閱死者的電腦和手機(jī),發(fā)現(xiàn)死者居然都...
    沈念sama閱讀 93,551評(píng)論 3 395
  • 文/潘曉璐 我一進(jìn)店門(mén)戳吝,熙熙樓的掌柜王于貴愁眉苦臉地迎上來(lái)浩销,“玉大人,你說(shuō)我怎么就攤上這事听哭÷螅” “怎么了塘雳?”我有些...
    開(kāi)封第一講書(shū)人閱讀 165,747評(píng)論 0 356
  • 文/不壞的土叔 我叫張陵,是天一觀的道長(zhǎng)普筹。 經(jīng)常有香客問(wèn)我败明,道長(zhǎng),這世上最難降的妖魔是什么太防? 我笑而不...
    開(kāi)封第一講書(shū)人閱讀 58,939評(píng)論 1 295
  • 正文 為了忘掉前任妻顶,我火速辦了婚禮,結(jié)果婚禮上蜒车,老公的妹妹穿的比我還像新娘讳嘱。我一直安慰自己,他們只是感情好酿愧,可當(dāng)我...
    茶點(diǎn)故事閱讀 67,955評(píng)論 6 392
  • 文/花漫 我一把揭開(kāi)白布沥潭。 她就那樣靜靜地躺著,像睡著了一般嬉挡。 火紅的嫁衣襯著肌膚如雪钝鸽。 梳的紋絲不亂的頭發(fā)上,一...
    開(kāi)封第一講書(shū)人閱讀 51,737評(píng)論 1 305
  • 那天棘伴,我揣著相機(jī)與錄音寞埠,去河邊找鬼。 笑死焊夸,一個(gè)胖子當(dāng)著我的面吹牛,可吹牛的內(nèi)容都是我干的蓝角。 我是一名探鬼主播阱穗,決...
    沈念sama閱讀 40,448評(píng)論 3 420
  • 文/蒼蘭香墨 我猛地睜開(kāi)眼,長(zhǎng)吁一口氣:“原來(lái)是場(chǎng)噩夢(mèng)啊……” “哼使鹅!你這毒婦竟也來(lái)了揪阶?” 一聲冷哼從身側(cè)響起,我...
    開(kāi)封第一講書(shū)人閱讀 39,352評(píng)論 0 276
  • 序言:老撾萬(wàn)榮一對(duì)情侶失蹤患朱,失蹤者是張志新(化名)和其女友劉穎鲁僚,沒(méi)想到半個(gè)月后,有當(dāng)?shù)厝嗽跇?shù)林里發(fā)現(xiàn)了一具尸體裁厅,經(jīng)...
    沈念sama閱讀 45,834評(píng)論 1 317
  • 正文 獨(dú)居荒郊野嶺守林人離奇死亡冰沙,尸身上長(zhǎng)有42處帶血的膿包…… 初始之章·張勛 以下內(nèi)容為張勛視角 年9月15日...
    茶點(diǎn)故事閱讀 37,992評(píng)論 3 338
  • 正文 我和宋清朗相戀三年,在試婚紗的時(shí)候發(fā)現(xiàn)自己被綠了执虹。 大學(xué)時(shí)的朋友給我發(fā)了我未婚夫和他白月光在一起吃飯的照片拓挥。...
    茶點(diǎn)故事閱讀 40,133評(píng)論 1 351
  • 序言:一個(gè)原本活蹦亂跳的男人離奇死亡,死狀恐怖袋励,靈堂內(nèi)的尸體忽然破棺而出侥啤,到底是詐尸還是另有隱情当叭,我是刑警寧澤,帶...
    沈念sama閱讀 35,815評(píng)論 5 346
  • 正文 年R本政府宣布盖灸,位于F島的核電站蚁鳖,受9級(jí)特大地震影響,放射性物質(zhì)發(fā)生泄漏赁炎。R本人自食惡果不足惜醉箕,卻給世界環(huán)境...
    茶點(diǎn)故事閱讀 41,477評(píng)論 3 331
  • 文/蒙蒙 一、第九天 我趴在偏房一處隱蔽的房頂上張望甘邀。 院中可真熱鬧琅攘,春花似錦、人聲如沸松邪。這莊子的主人今日做“春日...
    開(kāi)封第一講書(shū)人閱讀 32,022評(píng)論 0 22
  • 文/蒼蘭香墨 我抬頭看了看天上的太陽(yáng)逗抑。三九已至剧辐,卻和暖如春,著一層夾襖步出監(jiān)牢的瞬間邮府,已是汗流浹背荧关。 一陣腳步聲響...
    開(kāi)封第一講書(shū)人閱讀 33,147評(píng)論 1 272
  • 我被黑心中介騙來(lái)泰國(guó)打工, 沒(méi)想到剛下飛機(jī)就差點(diǎn)兒被人妖公主榨干…… 1. 我叫王不留褂傀,地道東北人忍啤。 一個(gè)月前我還...
    沈念sama閱讀 48,398評(píng)論 3 373
  • 正文 我出身青樓,卻偏偏與公主長(zhǎng)得像仙辟,于是被迫代替她去往敵國(guó)和親同波。 傳聞我的和親對(duì)象是個(gè)殘疾皇子,可洞房花燭夜當(dāng)晚...
    茶點(diǎn)故事閱讀 45,077評(píng)論 2 355

推薦閱讀更多精彩內(nèi)容