import pandas as pd
# Load the Ebola time-series CSV, parsing column 0 as datetimes.
ebola = pd.read_csv(
    'data/country_timeseries.csv',
    parse_dates=[0],
)
# Preview the first five rows of the first five columns.
print(ebola.head().iloc[:, :5])
'''
Date Day Cases_Guinea Cases_Liberia Cases_SierraLeone
0 2015-01-05 289 2776.0 NaN 10030.0
1 2015-01-04 288 2775.0 NaN 9780.0
2 2015-01-03 287 2769.0 8166.0 9722.0
3 2015-01-02 286 NaN 8157.0 NaN
4 2014-12-31 284 2730.0 8115.0 9633.0
'''
基于日期数据获取子集
# Boolean mask for rows dated June 2014, then show the first five columns.
in_june_2014 = (ebola['Date'].dt.year == 2014) & (ebola['Date'].dt.month == 6)
print(ebola.loc[in_june_2014].iloc[:, :5])
'''
Date Day Cases_Guinea Cases_Liberia Cases_SierraLeone
79 2014-06-30 100 413.0 107.0 239.0
80 2014-06-22 92 NaN 51.0 NaN
81 2014-06-20 90 390.0 NaN 158.0
82 2014-06-19 89 NaN 41.0 NaN
83 2014-06-18 88 390.0 NaN 136.0
84 2014-06-17 87 NaN NaN 97.0
85 2014-06-16 86 398.0 33.0 NaN
86 2014-06-10 80 351.0 13.0 89.0
87 2014-06-05 75 NaN 13.0 81.0
88 2014-06-03 73 344.0 13.0 NaN
89 2014-06-01 71 328.0 13.0 79.0
'''
DatetimeIndex 对象
处理包含datetime的数据时,经常把datetime对象设置成DataFrame的索引
# Use the parsed Date column as the row index.
ebola.index = ebola.loc[:, 'Date']
print(ebola.index)
'''
DatetimeIndex(['2015-01-05', '2015-01-04', '2015-01-03', '2015-01-02',
'2014-12-31', '2014-12-28', '2014-12-27', '2014-12-24',
'2014-12-21', '2014-12-20',
...
'2014-04-04', '2014-04-01', '2014-03-31', '2014-03-29',
'2014-03-28', '2014-03-27', '2014-03-26', '2014-03-25',
'2014-03-24', '2014-03-22'],
dtype='datetime64[ns]', name='Date', length=122, freq=None)
'''
# Select the rows of a given year with partial-string indexing.
# Go through .loc: plain ebola['2015'] is a column lookup, and current
# pandas no longer falls back to selecting rows from it.
print(ebola.loc['2015'].iloc[:, :5])
'''
Date Day Cases_Guinea Cases_Liberia Cases_SierraLeone
Date
2015-01-05 2015-01-05 289 2776.0 NaN 10030.0
2015-01-04 2015-01-04 288 2775.0 NaN 9780.0
2015-01-03 2015-01-03 287 2769.0 8166.0 9722.0
2015-01-02 2015-01-02 286 NaN 8157.0 NaN
'''
# Select the rows of a given year and month with partial-string indexing.
# Go through .loc: plain ebola['2014-06'] is a column lookup, and current
# pandas no longer falls back to selecting rows from it.
print(ebola.loc['2014-06'].iloc[:, :5])
'''
Date Day Cases_Guinea Cases_Liberia Cases_SierraLeone
Date
2014-06-30 2014-06-30 100 413.0 107.0 239.0
2014-06-22 2014-06-22 92 NaN 51.0 NaN
2014-06-20 2014-06-20 90 390.0 NaN 158.0
2014-06-19 2014-06-19 89 NaN 41.0 NaN
2014-06-18 2014-06-18 88 390.0 NaN 136.0
2014-06-17 2014-06-17 87 NaN NaN 97.0
2014-06-16 2014-06-16 86 398.0 33.0 NaN
2014-06-10 2014-06-10 80 351.0 13.0 89.0
2014-06-05 2014-06-05 75 NaN 13.0 81.0
2014-06-03 2014-06-03 73 344.0 13.0 NaN
2014-06-01 2014-06-01 71 328.0 13.0 79.0
'''
TimedeltaIndex 对象
用日期运算的结果作为index后,可以直接用TimedeltaIndex对象作为索引,但是必须要注意index顺序,从上到下。
# Time elapsed since the earliest date in the Date column, kept as a new column.
first_date = ebola['Date'].min()
ebola['outbreak_d'] = ebola['Date'] - first_date
# Index the rows by that elapsed time.
ebola.index = ebola.loc[:, 'outbreak_d']
print(ebola.index)
'''
TimedeltaIndex(['289 days', '288 days', '287 days', '286 days', '284 days',
'281 days', '280 days', '277 days', '274 days', '273 days',
...
'13 days', '10 days', '9 days', '7 days', '6 days',
'5 days', '4 days', '3 days', '2 days', '0 days'],
dtype='timedelta64[ns]', name='outbreak_d', length=122, freq=None)
'''
# Preview the first five rows of the first five columns.
print(ebola.head().iloc[:, :5])
'''
Date Day Cases_Guinea Cases_Liberia Cases_SierraLeone
outbreak_d
289 days 2015-01-05 289 2776.0 NaN 10030.0
288 days 2015-01-04 288 2775.0 NaN 9780.0
287 days 2015-01-03 287 2769.0 8166.0 9722.0
286 days 2015-01-02 286 NaN 8157.0 NaN
284 days 2014-12-31 284 2730.0 8115.0 9633.0
'''
# Label-slice the rows from '289 days' to '280 days', in the same direction as
# the index itself (largest offset first).
print(ebola.loc['289 days':'280 days'].iloc[:, :5])
'''
Date Day Cases_Guinea Cases_Liberia Cases_SierraLeone
outbreak_d
289 days 2015-01-05 289 2776.0 NaN 10030.0
288 days 2015-01-04 288 2775.0 NaN 9780.0
287 days 2015-01-03 287 2769.0 8166.0 9722.0
286 days 2015-01-02 286 NaN 8157.0 NaN
284 days 2014-12-31 284 2730.0 8115.0 9633.0
281 days 2014-12-28 281 2706.0 8018.0 9446.0
'''
# Wrong slice direction: the bounds run opposite to the index order, so the
# selection comes back empty.
print(ebola.loc['280 days':'289 days'].iloc[:, :5])
'''
Empty DataFrame
Columns: [Date, Day, Cases_Guinea, Cases_Liberia, Cases_SierraLeone]
Index: []
'''
日期范围
2015-01-01和2014-03-23的数据是缺失的
# Reload the raw CSV (column 0 parsed as datetimes) to start from a clean frame.
ebola = pd.read_csv(
    'data/country_timeseries.csv',
    parse_dates=[0],
)
# Top-left corner: first five rows, first five columns.
print(ebola.head().iloc[:, :5])
'''
Date Day Cases_Guinea Cases_Liberia Cases_SierraLeone
0 2015-01-05 289 2776.0 NaN 10030.0
1 2015-01-04 288 2775.0 NaN 9780.0
2 2015-01-03 287 2769.0 8166.0 9722.0
3 2015-01-02 286 NaN 8157.0 NaN
4 2014-12-31 284 2730.0 8115.0 9633.0
'''
# Bottom-left corner: last five rows, first five columns.
print(ebola.tail().iloc[:, :5])
'''
Date Day Cases_Guinea Cases_Liberia Cases_SierraLeone
117 2014-03-27 5 103.0 8.0 6.0
118 2014-03-26 4 86.0 NaN NaN
119 2014-03-25 3 86.0 NaN NaN
120 2014-03-24 2 86.0 NaN NaN
121 2014-03-22 0 49.0 NaN NaN
'''
创建一个日期范围来为数据集重建索引
# Daily date range from 2014-12-31 through 2015-01-05 (both ends included).
head_range = pd.date_range('2014-12-31', '2015-01-05')
print(head_range)
'''
DatetimeIndex(['2014-12-31', '2015-01-01', '2015-01-02', '2015-01-03',
'2015-01-04', '2015-01-05'],
dtype='datetime64[ns]', freq='D')
'''
在这个例子中,只取前5行数据,想把head_range设置为ebola_5的索引,需要先把日期设置为ebola_5的索引,然后为数据重建索引。
# Take the first five rows and index them by their Date column.
ebola_5 = ebola.head()
ebola_5.index = ebola_5['Date']
# reindex() returns a new DataFrame instead of modifying in place, so the
# result has to be assigned back; otherwise the frame printed below is
# unchanged. Dates in head_range with no matching row become all-missing rows.
ebola_5 = ebola_5.reindex(head_range)
print(ebola_5.iloc[:, :5])
'''
                 Date    Day  Cases_Guinea  Cases_Liberia  Cases_SierraLeone
2014-12-31 2014-12-31  284.0        2730.0         8115.0             9633.0
2015-01-01        NaT    NaN           NaN            NaN                NaN
2015-01-02 2015-01-02  286.0           NaN         8157.0                NaN
2015-01-03 2015-01-03  287.0        2769.0         8166.0             9722.0
2015-01-04 2015-01-04  288.0        2775.0            NaN             9780.0
2015-01-05 2015-01-05  289.0        2776.0            NaN            10030.0
'''
频率
在date_range函数中有一个参数freq,其默认值为D(代表day),表示日期范围内的值是逐日递增的。
# Business days (freq='B') between 2017-01-01 and 2017-01-07.
print(pd.date_range(start='2017-01-01', end='2017-01-07', freq='B'))
'''
DatetimeIndex(['2017-01-02', '2017-01-03', '2017-01-04', '2017-01-05',
'2017-01-06'],
dtype='datetime64[ns]', freq='B')
'''
偏移量
偏移量是在基本频率上做的一点调整,例如可以向刚刚创建的工作日范围添加一个偏移量,这样就可以隔一个工作日取一个工作日。
在基本频率前加一个倍数值就创建出了该偏移量
# Every other business day (freq='2B') in the week starting 2017-01-01.
print(pd.date_range(start='2017-01-01', end='2017-01-07', freq='2B'))
# DatetimeIndex(['2017-01-02', '2017-01-04', '2017-01-06'], dtype='datetime64[ns]', freq='2B')
偏移量可以和其他基本频率结合使用
# First Thursday of every month in 2017: 'WOM-1THU' means week-of-month 1,
# anchored on Thursday (not Friday).
print(pd.date_range('2017-01-01', '2017-12-31', freq='WOM-1THU'))
'''
DatetimeIndex(['2017-01-05', '2017-02-02', '2017-03-02', '2017-04-06',
'2017-05-04', '2017-06-01', '2017-07-06', '2017-08-03',
'2017-09-07', '2017-10-05', '2017-11-02', '2017-12-07'],
dtype='datetime64[ns]', freq='WOM-1THU')
'''
移动
有时需要更改数据的日期,例如修正数据中的某个测量误差,或者对数据的开始日期进行标准化,以便比较趋势。
比如需要比较不同国家的疫情传播速度,但是不同国家爆发疫情的时间不同,很难比较各国疫情的爆发情况。
# Keep only the day counter and the case counts of two countries.
ebola_sub = ebola.loc[:, ['Day', 'Cases_Guinea', 'Cases_Liberia']]
# Show the last ten rows of that subset.
print(ebola_sub.iloc[-10:])
'''
Day Cases_Guinea Cases_Liberia
112 13 143.0 18.0
113 10 127.0 8.0
114 9 122.0 8.0
115 7 112.0 7.0
116 6 112.0 3.0
117 5 103.0 8.0
118 4 86.0 NaN
119 3 86.0 NaN
120 2 86.0 NaN
121 0 49.0 NaN
'''
最好所有的日期都从常用的0天开始。
(1)由于有些日期没有列出来,所以需要为数据集的所有日期创建一个日期范围。
(2)需要计算数据集中最早日期和每列最早有效日期(非NaN)之间的差值。
(3)然后根据计算结果移动每列。
开始之前,首先读取ebola数据集的一个副本。同时把Date解析为date对象,并把日期指派给index。本例中会解析日期并直接设置为索引。
# Read the CSV again, parsing Date and using it directly as the index.
ebola = pd.read_csv(
    'data/country_timeseries.csv',
    index_col='Date',
    parse_dates=['Date'],
)
# First five rows, first four columns.
print(ebola.iloc[:5, :4])
'''
Day Cases_Guinea Cases_Liberia Cases_SierraLeone
Date
2015-01-05 289 2776.0 NaN 10030.0
2015-01-04 288 2775.0 NaN 9780.0
2015-01-03 287 2769.0 8166.0 9722.0
2015-01-02 286 NaN 8157.0 NaN
2014-12-31 284 2730.0 8115.0 9633.0
'''
# Daily range spanning the earliest through the latest date in the index.
first_day, last_day = ebola.index.min(), ebola.index.max()
new_idx = pd.date_range(start=first_day, end=last_day)
print(new_idx)
'''
DatetimeIndex(['2014-03-22', '2014-03-23', '2014-03-24', '2014-03-25',
'2014-03-26', '2014-03-27', '2014-03-28', '2014-03-29',
'2014-03-30', '2014-03-31',
...
'2014-12-27', '2014-12-28', '2014-12-29', '2014-12-30',
'2014-12-31', '2015-01-01', '2015-01-02', '2015-01-03',
'2015-01-04', '2015-01-05'],
dtype='datetime64[ns]', length=290, freq='D')
'''
# Flip the range so the newest date comes first, then reindex the data onto
# it. Slicing with [::-1] keeps new_idx a reusable DatetimeIndex, whereas
# reversed() would leave a one-shot iterator bound to the name.
new_idx = new_idx[::-1]
ebola = ebola.reindex(new_idx)
print(ebola.head().iloc[:, :4])
'''
Day Cases_Guinea Cases_Liberia Cases_SierraLeone
Date
2015-01-05 289.0 2776.0 NaN 10030.0
2015-01-04 288.0 2775.0 NaN 9780.0
2015-01-03 287.0 2769.0 8166.0 9722.0
2015-01-02 286.0 NaN 8157.0 NaN
2015-01-01 NaN NaN NaN NaN
'''
# For every column, find the index label of its last non-missing value
# (Series.last_valid_index); first_valid_index is the counterpart that returns
# the label of the first non-missing value.
# NOTE(review): this is each column's earliest valid date only while the rows
# are ordered newest-first - confirm the index order before relying on it.
last_valid = ebola.apply(lambda column: column.last_valid_index())
print(last_valid)
'''
Day 2014-03-22
Cases_Guinea 2014-03-22
Cases_Liberia 2014-03-27
Cases_SierraLeone 2014-03-27
Cases_Nigeria 2014-07-23
Cases_Senegal 2014-08-31
Cases_UnitedStates 2014-10-01
Cases_Spain 2014-10-08
Cases_Mali 2014-10-22
Deaths_Guinea 2014-03-22
Deaths_Liberia 2014-03-27
Deaths_SierraLeone 2014-03-27
Deaths_Nigeria 2014-07-23
Deaths_Senegal 2014-09-07
Deaths_UnitedStates 2014-10-01
Deaths_Spain 2014-10-08
Deaths_Mali 2014-10-22
dtype: datetime64[ns]
'''
# Earliest date present in the index.
earliest_date = ebola.index.min()
print(earliest_date)
# 2014-03-22 00:00:00
# Gap between the overall earliest date and the date found for each column.
shift_values = last_valid.sub(earliest_date)
print(shift_values)
'''
Day 0 days
Cases_Guinea 0 days
Cases_Liberia 5 days
Cases_SierraLeone 5 days
Cases_Nigeria 123 days
Cases_Senegal 162 days
Cases_UnitedStates 193 days
Cases_Spain 200 days
Cases_Mali 214 days
Deaths_Guinea 0 days
Deaths_Liberia 5 days
Deaths_SierraLeone 5 days
Deaths_Nigeria 123 days
Deaths_Senegal 169 days
Deaths_UnitedStates 193 days
Deaths_Spain 200 days
Deaths_Mali 214 days
dtype: timedelta64[ns]
'''
# Shift every column by its own offset in whole days. A positive offset moves
# the values down; a negative one would move them up.
# The offset is looked up by column label: shift_values is indexed by column
# name, and integer-position lookups on a label-indexed Series are deprecated.
ebola_dict = {
    col: ebola[col].shift(shift_values[col].days)
    for col in ebola
}
ebola_shift = pd.DataFrame(ebola_dict)
# Reselect the columns explicitly so they keep the order of the original frame.
ebola_shift = ebola_shift[ebola.columns]
# After shifting, every column has a value in the last row.
print(ebola_shift.tail())
'''
Day Cases_Guinea Cases_Liberia Cases_SierraLeone \
Date
2014-03-26 4.0 86.0 8.0 2.0
2014-03-25 3.0 86.0 NaN NaN
2014-03-24 2.0 86.0 7.0 NaN
2014-03-23 NaN NaN 3.0 2.0
2014-03-22 0.0 49.0 8.0 6.0
Cases_Nigeria Cases_Senegal Cases_UnitedStates Cases_Spain \
Date
2014-03-26 1.0 NaN 1.0 1.0
2014-03-25 NaN NaN NaN NaN
2014-03-24 NaN NaN NaN NaN
2014-03-23 NaN NaN NaN NaN
2014-03-22 0.0 1.0 1.0 1.0
Cases_Mali Deaths_Guinea Deaths_Liberia Deaths_SierraLeone \
Date
2014-03-26 NaN 62.0 4.0 2.0
2014-03-25 NaN 60.0 NaN NaN
2014-03-24 NaN 59.0 2.0 NaN
2014-03-23 NaN NaN 3.0 2.0
2014-03-22 1.0 29.0 6.0 5.0
Deaths_Nigeria Deaths_Senegal Deaths_UnitedStates Deaths_Spain \
Date
2014-03-26 1.0 NaN 0.0 1.0
2014-03-25 NaN NaN NaN NaN
2014-03-24 NaN NaN NaN NaN
2014-03-23 NaN NaN NaN NaN
2014-03-22 0.0 0.0 0.0 1.0
Deaths_Mali
Date
2014-03-26 NaN
2014-03-25 NaN
2014-03-24 NaN
2014-03-23 NaN
2014-03-22 1.0
'''
每一行的日期索引已经失效,可以将其删除,然后指定正确的索引,即Day列。Day不再表示整个疫情爆发的第一天,而是指特定国家疫情爆发的第一天。
# Replace the index with the Day column.
ebola_shift.index = ebola_shift.loc[:, 'Day']
# Day now lives in the index, so remove the duplicate column.
ebola_shift = ebola_shift.drop(columns=['Day'])
print(ebola_shift.tail())
'''
Cases_Guinea Cases_Liberia Cases_SierraLeone Cases_Nigeria \
Day
4.0 86.0 8.0 2.0 1.0
3.0 86.0 NaN NaN NaN
2.0 86.0 7.0 NaN NaN
NaN NaN 3.0 2.0 NaN
0.0 49.0 8.0 6.0 0.0
Cases_Senegal Cases_UnitedStates Cases_Spain Cases_Mali \
Day
4.0 NaN 1.0 1.0 NaN
3.0 NaN NaN NaN NaN
2.0 NaN NaN NaN NaN
NaN NaN NaN NaN NaN
0.0 1.0 1.0 1.0 1.0
Deaths_Guinea Deaths_Liberia Deaths_SierraLeone Deaths_Nigeria \
Day
4.0 62.0 4.0 2.0 1.0
3.0 60.0 NaN NaN NaN
2.0 59.0 2.0 NaN NaN
NaN NaN 3.0 2.0 NaN
0.0 29.0 6.0 5.0 0.0
Deaths_Senegal Deaths_UnitedStates Deaths_Spain Deaths_Mali
Day
4.0 NaN 0.0 1.0 NaN
3.0 NaN NaN NaN NaN
2.0 NaN NaN NaN NaN
NaN NaN NaN NaN NaN
0.0 0.0 0.0 1.0 1.0
'''
重采样
- 下采样:从高频率到低频率(比如从每天到每月)
- 上采样:从低频率到高频率(比如从每月到每天)
- 原样采样:采样频率不变(比如每月的第一个星期四到每月的最后一个星期五)
resample函数有一个rule参数,用于接收偏移量字符串。
# Downsample from daily to monthly. Each month holds several values, so they
# have to be aggregated - here with the mean.
# NOTE(review): recent pandas releases prefer 'ME' over 'M' for month-end;
# confirm against the pandas version in use before changing it.
monthly_groups = ebola.resample(rule='M')
down = monthly_groups.mean()
print(down.iloc[0:5, 0:5])
'''
Day Cases_Guinea Cases_Liberia Cases_SierraLeone \
Date
2014-03-31 4.500000 94.500000 6.500000 3.333333
2014-04-30 24.333333 177.818182 24.555556 2.200000
2014-05-31 51.888889 248.777778 12.555556 7.333333
2014-06-30 84.636364 373.428571 35.500000 125.571429
2014-07-31 115.700000 423.000000 212.300000 420.500000
Cases_Nigeria
Date
2014-03-31 NaN
2014-04-30 NaN
2014-05-31 NaN
2014-06-30 NaN
2014-07-31 1.333333
'''
# Upsample the monthly result back to daily frequency.
# Note how many dates get introduced: days with no source value are filled
# with missing values (NaN).
daily_groups = down.resample(rule='D')
up = daily_groups.mean()
print(up.iloc[0:5, 0:5])
'''
Day Cases_Guinea Cases_Liberia Cases_SierraLeone Cases_Nigeria
Date
2014-03-31 4.5 94.5 6.5 3.333333 NaN
2014-04-01 NaN NaN NaN NaN NaN
2014-04-02 NaN NaN NaN NaN NaN
2014-04-03 NaN NaN NaN NaN NaN
2014-04-04 NaN NaN NaN NaN NaN
'''
时区
import pytz
import re
# The simplest way to handle time zones in pandas is through the zone-name
# strings that pytz provides; list the common zone names that start with 'US'.
regex = re.compile(r'^US')
selected_files = (zone for zone in pytz.common_timezones if regex.search(zone))
print(list(selected_files))
# ['US/Alaska', 'US/Arizona', 'US/Central', 'US/Eastern', 'US/Hawaii', 'US/Mountain', 'US/Pacific']
# Create a timestamp with its time zone attached at construction.
depart = pd.Timestamp('2017-08-29 07:00', tz='US/Eastern')
print(depart)
# 2017-08-29 07:00:00-04:00

# Alternative: build a naive timestamp first, then attach a zone with tz_localize.
arrive = pd.Timestamp('2017-08-29 09:57')
print(arrive)
arrive = arrive.tz_localize(tz='US/Pacific')
print(arrive)
# 2017-08-29 09:57:00-07:00

# Express the arrival time in the Eastern zone.
arrive = arrive.tz_convert(tz='US/Eastern')
print(arrive)
# 2017-08-29 12:57:00-04:00

# Time difference between the two instants. Older versions needed both
# timestamps converted to the same zone first, i.e.
# arrive.tz_convert('US/Eastern') - depart; that is no longer required.
duration = arrive - depart
print(duration)
# 0 days 05:57:00