pandas的向量化字符串和時間序列處理。
代碼
import numpy as np
import pandas as pd
import time
from datetime import datetime
import matplotlib as mpl
import matplotlib.pyplot as plt
from dateutil import parser
from pandas.tseries.offsets import BDay
# Alternative look: plt.style.use('classic')
# Matplotlib 3.6 renamed the bundled seaborn style sheets to 'seaborn-v0_8-*'
# and later releases removed the old names. Prefer the new name when the
# installed Matplotlib lists it, otherwise fall back to the legacy name.
plt.style.use(
    'seaborn-v0_8-whitegrid'
    if 'seaborn-v0_8-whitegrid' in plt.style.available
    else 'seaborn-whitegrid'
)
# Seed NumPy's global random generator with a fixed value.
np.random.seed(0)
# Configure pandas display: show at most 10 rows and 10 columns when printing.
pd.options.display.max_rows = 10
pd.options.display.max_columns = 10
# Vectorized string operations.
# A plain list comprehension calling s.capitalize() on every entry would fail
# on the None value; the Series .str accessor passes the missing entry through.
data = ['peter', 'Paul', None, 'MARY', 'gUIDO']
names = pd.Series(data)
capitalized_names = names.str.capitalize()
print(names, capitalized_names, sep='\n')
# Sample names used to demonstrate the pandas .str accessor.
monte = pd.Series([
    'Graham Chapman',
    'John Cleese',
    'Terry Gilliam',
    'Eric Idle',
    'Terry Jones',
    'Michael Palin',
])
# Each name split on whitespace; reused below to pick out the last word.
name_parts = monte.str.split()
for view in (
    monte.str.lower(),                           # lower-cased names
    monte.str.len(),                             # length of each name
    monte.str.startswith('T'),                   # does the name start with 'T'?
    name_parts,                                  # list of words per name
    monte.str.extract('([A-Za-z]+)'),            # first run of ASCII letters
    monte.str.findall(r'^[^AEIOU].*[^aeiou]$'),  # first char not an upper-case vowel, last char not a lower-case vowel
    monte.str[0:3],                              # first three characters
    name_parts.str.get(-1),                      # last word of each name
):
    print(view)

# Attach a '|'-separated code string to every name.
info_codes = ['B|C|D', 'B|D', 'A|C', 'B|D', 'B|C', 'B|C|D']
full_monte = pd.DataFrame({'name': monte, 'info': info_codes})
print(full_monte)
# Expand the codes into one indicator column per distinct code.
print(full_monte['info'].str.get_dummies('|'))
# Working with time series.
# Build a date with the standard-library datetime type.
print(datetime(2015, 7, 4))
# dateutil's parser turns a free-form date string into a datetime.
date = parser.parse("4th of July, 2015")
weekday_name = date.strftime('%A')  # '%A' formats the full weekday name
print(date)
print(weekday_name)
# The same date stored as a NumPy datetime64 value (a 0-d array).
date = np.array('2015-07-04', dtype=np.datetime64)
print(date)
# Adding the integers 0..11 produces twelve consecutive days.
day_offsets = np.arange(12)
print(date + day_offsets)
# A datetime64 can also carry a time of day.
print(np.datetime64('2015-07-04 12:00'))
# pandas parses the same free-form string into a Timestamp.
date = pd.to_datetime('4th of July, 2015')
print(date, date.strftime('%A'), sep='\n')
# Adding a run of whole-day timedeltas to the Timestamp yields twelve
# consecutive days as a DatetimeIndex.
following_days = date + pd.to_timedelta(np.arange(12), unit='D')
print(following_days)
# A Series indexed by dates supports slicing with date strings.
index = pd.DatetimeIndex([
    '2014-07-04',
    '2014-08-04',
    '2015-07-04',
    '2015-08-04',
])
data = pd.Series(range(4), index=index)
print(data)
# Slice between two dates; both endpoints are included.
date_span = slice('2014-07-04', '2015-07-04')
print(data[date_span])
# Passing only a year selects every entry that falls in that year.
print(data['2015'])
# - For timestamps, pandas provides the Timestamp type. As introduced earlier,
#   it is essentially a replacement for Python's native datetime, but built on
#   the more efficient numpy.datetime64 type. Its index structure is DatetimeIndex.
# - For time periods, pandas provides the Period type, which encodes a
#   fixed-frequency interval based on numpy.datetime64.
#   Its index structure is PeriodIndex.
# - For time deltas or durations, pandas provides the Timedelta type, a more
#   efficient replacement for Python's native datetime.timedelta, likewise
#   based on numpy.timedelta64. Its index structure is TimedeltaIndex.
# Parse a mix of datetime objects and differently formatted date strings.
dates = pd.to_datetime([
    datetime(2015, 7, 3),
    '4th of July, 2015',
    '2015-Jul-6',
    '07-07-2015',
    '20150708',
])
first_date = dates[0]
for view in (
    dates,                 # DatetimeIndex of the parsed values
    dates.to_period('D'),  # the same days as a daily PeriodIndex
    dates - first_date,    # offsets from the first date as a TimedeltaIndex
):
    print(view)
# pd.date_range() builds timestamps, pd.period_range() builds periods and
# pd.timedelta_range() builds durations. As seen earlier, Python's range() and
# NumPy's np.arange() build a sequence from a start, an end and an optional step.
# Daily timestamps from an explicit start and end.
print(pd.date_range('2015-07-03', '2015-07-10'))
# Daily timestamps from a start and a number of periods.
print(pd.date_range('2015-07-03', periods=8))
# NOTE: the upper-case 'H' and 'T' frequency aliases are deprecated since
# pandas 2.2; 'h' (hour) and 'min' (minute) are the supported spellings.
hourly_stamps = pd.date_range('2015-07-03', periods=8, freq='h')
print(hourly_stamps)
# Monthly periods ('M' remains the period alias for months).
print(pd.period_range('2015-07', periods=8, freq='M'))
# Durations growing by one hour per step.
hourly_deltas = pd.timedelta_range(0, periods=10, freq='h')
print(hourly_deltas)
# Aliases can be combined with multipliers: 2 hours 30 minutes per step.
stepped_deltas = pd.timedelta_range(0, periods=9, freq='2h30min')
print(stepped_deltas)
# Business-day frequency skips the weekend.
print(pd.date_range('2015-07-01', periods=5, freq=BDay()))
# When working with time-series data, you often need to resample it at a new
# frequency (higher or lower).
# You can do this with the resample() method, or with the simpler asfreq() method.
# The main difference between the two is that resample() is fundamentally a
# data aggregation, whereas asfreq() is fundamentally a data selection.
輸出
0 peter
1 Paul
2 None
3 MARY
4 gUIDO
dtype: object
0 Peter
1 Paul
2 None
3 Mary
4 Guido
dtype: object
0 graham chapman
1 john cleese
2 terry gilliam
3 eric idle
4 terry jones
5 michael palin
dtype: object
0 14
1 11
2 13
3 9
4 11
5 13
dtype: int64
0 False
1 False
2 True
3 False
4 True
5 False
dtype: bool
0 [Graham, Chapman]
1 [John, Cleese]
2 [Terry, Gilliam]
3 [Eric, Idle]
4 [Terry, Jones]
5 [Michael, Palin]
dtype: object
0
0 Graham
1 John
2 Terry
3 Eric
4 Terry
5 Michael
0 [Graham Chapman]
1 []
2 [Terry Gilliam]
3 []
4 [Terry Jones]
5 [Michael Palin]
dtype: object
0 Gra
1 Joh
2 Ter
3 Eri
4 Ter
5 Mic
dtype: object
0 Chapman
1 Cleese
2 Gilliam
3 Idle
4 Jones
5 Palin
dtype: object
name info
0 Graham Chapman B|C|D
1 John Cleese B|D
2 Terry Gilliam A|C
3 Eric Idle B|D
4 Terry Jones B|C
5 Michael Palin B|C|D
A B C D
0 0 1 1 1
1 0 1 0 1
2 1 0 1 0
3 0 1 0 1
4 0 1 1 0
5 0 1 1 1
2015-07-04 00:00:00
2015-07-04 00:00:00
Saturday
2015-07-04
['2015-07-04' '2015-07-05' '2015-07-06' '2015-07-07' '2015-07-08'
'2015-07-09' '2015-07-10' '2015-07-11' '2015-07-12' '2015-07-13'
'2015-07-14' '2015-07-15']
2015-07-04T12:00
2015-07-04 00:00:00
Saturday
DatetimeIndex(['2015-07-04', '2015-07-05', '2015-07-06', '2015-07-07',
'2015-07-08', '2015-07-09', '2015-07-10', '2015-07-11',
'2015-07-12', '2015-07-13', '2015-07-14', '2015-07-15'],
dtype='datetime64[ns]', freq=None)
2014-07-04 0
2014-08-04 1
2015-07-04 2
2015-08-04 3
dtype: int64
2014-07-04 0
2014-08-04 1
2015-07-04 2
dtype: int64
2015-07-04 2
2015-08-04 3
dtype: int64
DatetimeIndex(['2015-07-03', '2015-07-04', '2015-07-06', '2015-07-07',
'2015-07-08'],
dtype='datetime64[ns]', freq=None)
PeriodIndex(['2015-07-03', '2015-07-04', '2015-07-06', '2015-07-07',
'2015-07-08'],
dtype='period[D]', freq='D')
TimedeltaIndex(['0 days', '1 days', '3 days', '4 days', '5 days'], dtype='timedelta64[ns]', freq=None)
DatetimeIndex(['2015-07-03', '2015-07-04', '2015-07-05', '2015-07-06',
'2015-07-07', '2015-07-08', '2015-07-09', '2015-07-10'],
dtype='datetime64[ns]', freq='D')
DatetimeIndex(['2015-07-03', '2015-07-04', '2015-07-05', '2015-07-06',
'2015-07-07', '2015-07-08', '2015-07-09', '2015-07-10'],
dtype='datetime64[ns]', freq='D')
DatetimeIndex(['2015-07-03 00:00:00', '2015-07-03 01:00:00',
'2015-07-03 02:00:00', '2015-07-03 03:00:00',
'2015-07-03 04:00:00', '2015-07-03 05:00:00',
'2015-07-03 06:00:00', '2015-07-03 07:00:00'],
dtype='datetime64[ns]', freq='H')
PeriodIndex(['2015-07', '2015-08', '2015-09', '2015-10', '2015-11', '2015-12',
'2016-01', '2016-02'],
dtype='period[M]', freq='M')
TimedeltaIndex(['0 days 00:00:00', '0 days 01:00:00', '0 days 02:00:00',
'0 days 03:00:00', '0 days 04:00:00', '0 days 05:00:00',
'0 days 06:00:00', '0 days 07:00:00', '0 days 08:00:00',
'0 days 09:00:00'],
dtype='timedelta64[ns]', freq='H')
TimedeltaIndex(['0 days 00:00:00', '0 days 02:30:00', '0 days 05:00:00',
'0 days 07:30:00', '0 days 10:00:00', '0 days 12:30:00',
'0 days 15:00:00', '0 days 17:30:00', '0 days 20:00:00'],
dtype='timedelta64[ns]', freq='150T')
DatetimeIndex(['2015-07-01', '2015-07-02', '2015-07-03', '2015-07-06',
'2015-07-07'],
dtype='datetime64[ns]', freq='B')