第01章 Pandas基礎
第02章 DataFrame運算
第03章 數據分析入門
第04章 選取數據子集
第05章 布爾索引
第06章 索引對齊
第07章 分組聚合、過濾、轉換
第08章 數據清理
第09章 合并Pandas對象
第10章 時間序列分析
第11章 用Matplotlib、Pandas、Seaborn進行可視化
In[1]: import pandas as pd
import numpy as np
1. 定義聚合
# 讀取flights數據集,查詢頭部
In[2]: flights = pd.read_csv('data/flights.csv')
flights.head()
Out[2]:
# 按照AIRLINE分組,使用agg方法,傳入要聚合的列和聚合函數
In[3]: flights.groupby('AIRLINE').agg({'ARR_DELAY':'mean'}).head()
Out[3]:
# 或者要選取的列使用索引,聚合函數作為字符串傳入agg
In[4]: flights.groupby('AIRLINE')['ARR_DELAY'].agg('mean').head()
Out[4]:
AIRLINE
AA 5.542661
AS -0.833333
B6 8.692593
DL 0.339691
EV 7.034580
Name: ARR_DELAY, dtype: float64
# 也可以向agg中傳入NumPy的mean函數
In[5]: flights.groupby('AIRLINE')['ARR_DELAY'].agg(np.mean).head()
Out[5]:
# 也可以直接使用mean()函數
In[6]: flights.groupby('AIRLINE')['ARR_DELAY'].mean().head()
Out[6]:
原理
# groupby方法產生的是一個DataFrameGroupBy對象
In[7]: grouped = flights.groupby('AIRLINE')
type(grouped)
Out[7]: pandas.core.groupby.DataFrameGroupBy
更多
# 如果agg接收的不是聚合函數,則會導致異常
In[8]: flights.groupby('AIRLINE')['ARR_DELAY'].agg(np.sqrt)
/Users/Ted/anaconda/lib/python3.6/site-packages/pandas/core/groupby.py:842: RuntimeWarning: invalid value encountered in sqrt
f = lambda x: func(x, *args, **kwargs)
/Users/Ted/anaconda/lib/python3.6/site-packages/pandas/core/groupby.py:3015: RuntimeWarning: invalid value encountered in sqrt
output = func(group, *args, **kwargs)
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
/Users/Ted/anaconda/lib/python3.6/site-packages/pandas/core/groupby.py in agg_series(self, obj, func)
2177 try:
-> 2178 return self._aggregate_series_fast(obj, func)
2179 except Exception:
/Users/Ted/anaconda/lib/python3.6/site-packages/pandas/core/groupby.py in _aggregate_series_fast(self, obj, func)
2197 dummy)
-> 2198 result, counts = grouper.get_result()
2199 return result, counts
pandas/_libs/src/reduce.pyx in pandas._libs.lib.SeriesGrouper.get_result (pandas/_libs/lib.c:39105)()
pandas/_libs/src/reduce.pyx in pandas._libs.lib.SeriesGrouper.get_result (pandas/_libs/lib.c:38973)()
pandas/_libs/src/reduce.pyx in pandas._libs.lib._get_result_array (pandas/_libs/lib.c:32039)()
ValueError: function does not reduce
During handling of the above exception, another exception occurred:
ValueError Traceback (most recent call last)
/Users/Ted/anaconda/lib/python3.6/site-packages/pandas/core/groupby.py in aggregate(self, func_or_funcs, *args, **kwargs)
2882 try:
-> 2883 return self._python_agg_general(func_or_funcs, *args, **kwargs)
2884 except Exception:
/Users/Ted/anaconda/lib/python3.6/site-packages/pandas/core/groupby.py in _python_agg_general(self, func, *args, **kwargs)
847 try:
--> 848 result, counts = self.grouper.agg_series(obj, f)
849 output[name] = self._try_cast(result, obj, numeric_only=True)
/Users/Ted/anaconda/lib/python3.6/site-packages/pandas/core/groupby.py in agg_series(self, obj, func)
2179 except Exception:
-> 2180 return self._aggregate_series_pure_python(obj, func)
2181
/Users/Ted/anaconda/lib/python3.6/site-packages/pandas/core/groupby.py in _aggregate_series_pure_python(self, obj, func)
2214 isinstance(res, list)):
-> 2215 raise ValueError('Function does not reduce')
2216 result = np.empty(ngroups, dtype='O')
ValueError: Function does not reduce
During handling of the above exception, another exception occurred:
Exception Traceback (most recent call last)
<ipython-input-8-2bcc9ccfec77> in <module>()
----> 1 flights.groupby('AIRLINE')['ARR_DELAY'].agg(np.sqrt)
/Users/Ted/anaconda/lib/python3.6/site-packages/pandas/core/groupby.py in aggregate(self, func_or_funcs, *args, **kwargs)
2883 return self._python_agg_general(func_or_funcs, *args, **kwargs)
2884 except Exception:
-> 2885 result = self._aggregate_named(func_or_funcs, *args, **kwargs)
2886
2887 index = Index(sorted(result), name=self.grouper.names[0])
/Users/Ted/anaconda/lib/python3.6/site-packages/pandas/core/groupby.py in _aggregate_named(self, func, *args, **kwargs)
3015 output = func(group, *args, **kwargs)
3016 if isinstance(output, (Series, Index, np.ndarray)):
-> 3017 raise Exception('Must produce aggregated value')
3018 result[name] = self._try_cast(output, group)
3019
Exception: Must produce aggregated value
2. 用多個列和函數進行分組和聚合
# 導入數據
In[9]: flights = pd.read_csv('data/flights.csv')
flights.head()
Out[9]:
# 每家航空公司每周平均每天取消的航班數
In[10]: flights.groupby(['AIRLINE', 'WEEKDAY'])['CANCELLED'].agg('sum').head(7)
Out[10]: AIRLINE WEEKDAY
AA 1 41
2 9
3 16
4 20
5 18
6 21
7 29
Name: CANCELLED, dtype: int64
# 分組可以是多組,選取可以是多組,聚合函數也可以是多個
# 每周每家航空公司取消或改變航線的航班總數和比例
In[11]: flights.groupby(['AIRLINE', 'WEEKDAY'])['CANCELLED', 'DIVERTED'].agg(['sum', 'mean']).head(7)
Out[11]:
# 用列表和嵌套字典對多列分組和聚合
# 對于每條航線,找到總航班數,取消的數量和比例,飛行時間的平均時間和方差
In[12]: group_cols = ['ORG_AIR', 'DEST_AIR']
agg_dict = {'CANCELLED':['sum', 'mean', 'size'],
'AIR_TIME':['mean', 'var']}
flights.groupby(group_cols).agg(agg_dict).head()
# flights.groupby(['ORG_AIR', 'DEST_AIR']).agg({'CANCELLED': ['sum', 'mean', 'size'],
# 'AIR_TIME':['mean', 'var']}).head()
Out[12]:
3. 分組后去除多級索引
# 讀取數據
In[13]: flights = pd.read_csv('data/flights.csv')
flights.head()
Out[13]:
# 按'AIRLINE', 'WEEKDAY'分組澈灼,分別對DIST和ARR_DELAY聚合
In[14]: airline_info = flights.groupby(['AIRLINE', 'WEEKDAY'])\
.agg({'DIST':['sum', 'mean'],
'ARR_DELAY':['min', 'max']}).astype(int)
airline_info.head()
Out[14]:
# 行和列都有兩級索引竞川,get_level_values(0)取出第一級索引
In[15]: level0 = airline_info.columns.get_level_values(0)
level0
Out[15]: Index(['DIST', 'DIST', 'ARR_DELAY', 'ARR_DELAY'], dtype='object')
# get_level_values(1)取出第二級索引
In[16]: level1 = airline_info.columns.get_level_values(1)
level1
Out[16]: Index(['sum', 'mean', 'min', 'max'], dtype='object')
# 一級和二級索引拼接成新的列索引
In[17]: airline_info.columns = level0 + '_' + level1
In[18]: airline_info.head(7)
Out[18]:
# reset_index()可以將行索引變成單級
In[19]: airline_info.reset_index().head(7)
Out[19]:
更多
# Pandas默認會在分組運算后,將所有分組的列放在索引中,as_index設為False可以避免這么做。分組后使用reset_index,也可以達到同樣的效果
In[20]: flights.groupby(['AIRLINE'], as_index=False)['DIST'].agg('mean').round(0)
Out[20]:
# 上面這么做,會默認對AIRLINE排序,sort設為False可以避免排序
In[21]: flights.groupby(['AIRLINE'], as_index=False, sort=False)['DIST'].agg('mean')
Out[21]:
4. 自定義聚合函數
In[22]: college = pd.read_csv('data/college.csv')
college.head()
Out[22]:
# 求出每個州的本科生的平均值和標準差
In[23]: college.groupby('STABBR')['UGDS'].agg(['mean', 'std']).round(0).head()
Out[23]:
# 遠離平均值的標準差的最大個數,寫一個自定義函數
In[24]: def max_deviation(s):
std_score = (s - s.mean()) / s.std()
return std_score.abs().max()
# agg聚合函數在調用方法時,直接引入自定義的函數名
In[25]: college.groupby('STABBR')['UGDS'].agg(max_deviation).round(1).head()
Out[25]: STABBR
AK 2.6
AL 5.8
AR 6.3
AS NaN
AZ 9.9
Name: UGDS, dtype: float64
更多
# 自定義的聚合函數也適用于多個數值列
In[26]: college.groupby('STABBR')['UGDS', 'SATVRMID', 'SATMTMID'].agg(max_deviation).round(1).head()
Out[26]:
# 自定義聚合函數也可以和預先定義的函數一起使用
In[27]: college.groupby(['STABBR', 'RELAFFIL'])['UGDS', 'SATVRMID', 'SATMTMID']\
.agg([max_deviation, 'mean', 'std']).round(1).head()
Out[27]:
# Pandas使用函數名作為返回列的名字;你可以直接使用rename方法修改,或通過__name__屬性修改
In[28]: max_deviation.__name__
Out[28]: 'max_deviation'
In[29]: max_deviation.__name__ = 'Max Deviation'
In[30]: college.groupby(['STABBR', 'RELAFFIL'])['UGDS', 'SATVRMID', 'SATMTMID']\
.agg([max_deviation, 'mean', 'std']).round(1).head()
Out[30]:
5. 用 *args 和 **kwargs 自定義聚合函數
# 用inspect模塊查看groupby對象的agg方法的簽名
In[31]: college = pd.read_csv('data/college.csv')
grouped = college.groupby(['STABBR', 'RELAFFIL'])
In[32]: import inspect
inspect.signature(grouped.agg)
Out[32]: <Signature (arg, *args, **kwargs)>
如何做
# 自定義一個返回去本科生人數在1000和3000之間的比例的函數
In[33]: def pct_between_1_3k(s):
return s.between(1000, 3000).mean()
# 用州和宗教分組,再聚合
In[34]: college.groupby(['STABBR', 'RELAFFIL'])['UGDS'].agg(pct_between_1_3k).head(9)
Out[34]:
STABBR RELAFFIL
AK 0 0.142857
1 0.000000
AL 0 0.236111
1 0.333333
AR 0 0.279412
1 0.111111
AS 0 1.000000
AZ 0 0.096774
1 0.000000
Name: UGDS, dtype: float64
# 但是這個函數不能讓用戶自定義上下限,再新寫一個函數
In[35]: def pct_between(s, low, high):
return s.between(low, high).mean()
# 使用這個自定義聚合函數,并傳入最大和最小值
In[36]: college.groupby(['STABBR', 'RELAFFIL'])['UGDS'].agg(pct_between, 1000, 10000).head(9)
Out[36]:
STABBR RELAFFIL
AK 0 0.428571
1 0.000000
AL 0 0.458333
1 0.375000
AR 0 0.397059
1 0.166667
AS 0 1.000000
AZ 0 0.233871
1 0.111111
Name: UGDS, dtype: float64
原理
# 顯示指定最大和最小值
In[37]: college.groupby(['STABBR', 'RELAFFIL'])['UGDS'].agg(pct_between, high=10000, low=1000).head(9)
Out[37]:
STABBR RELAFFIL
AK 0 0.428571
1 0.000000
AL 0 0.458333
1 0.375000
AR 0 0.397059
1 0.166667
AS 0 1.000000
AZ 0 0.233871
1 0.111111
Name: UGDS, dtype: float64
# 也可以關鍵字參數和非關鍵字參數混合使用,只要非關鍵字參數在后面
In[38]: college.groupby(['STABBR', 'RELAFFIL'])['UGDS'].agg(pct_between, 1000, high=10000).head(9)
Out[38]:
STABBR RELAFFIL
AK 0 0.428571
1 0.000000
AL 0 0.458333
1 0.375000
AR 0 0.397059
1 0.166667
AS 0 1.000000
AZ 0 0.233871
1 0.111111
Name: UGDS, dtype: float64
更多
# Pandas不支持多重聚合時,使用參數
In[39]: college.groupby(['STABBR', 'RELAFFIL'])['UGDS'].agg(['mean', pct_between], low=100, high=1000)
---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
<ipython-input-39-3e3e18919cf9> in <module>()
----> 1 college.groupby(['STABBR', 'RELAFFIL'])['UGDS'].agg(['mean', pct_between], low=100, high=1000)
/Users/Ted/anaconda/lib/python3.6/site-packages/pandas/core/groupby.py in aggregate(self, func_or_funcs, *args, **kwargs)
2871 if hasattr(func_or_funcs, '__iter__'):
2872 ret = self._aggregate_multiple_funcs(func_or_funcs,
-> 2873 (_level or 0) + 1)
2874 else:
2875 cyfunc = self._is_cython_func(func_or_funcs)
/Users/Ted/anaconda/lib/python3.6/site-packages/pandas/core/groupby.py in _aggregate_multiple_funcs(self, arg, _level)
2944 obj._reset_cache()
2945 obj._selection = name
-> 2946 results[name] = obj.aggregate(func)
2947
2948 if isinstance(list(compat.itervalues(results))[0],
/Users/Ted/anaconda/lib/python3.6/site-packages/pandas/core/groupby.py in aggregate(self, func_or_funcs, *args, **kwargs)
2878
2879 if self.grouper.nkeys > 1:
-> 2880 return self._python_agg_general(func_or_funcs, *args, **kwargs)
2881
2882 try:
/Users/Ted/anaconda/lib/python3.6/site-packages/pandas/core/groupby.py in _python_agg_general(self, func, *args, **kwargs)
852
853 if len(output) == 0:
--> 854 return self._python_apply_general(f)
855
856 if self.grouper._filter_empty_groups:
/Users/Ted/anaconda/lib/python3.6/site-packages/pandas/core/groupby.py in _python_apply_general(self, f)
718 def _python_apply_general(self, f):
719 keys, values, mutated = self.grouper.apply(f, self._selected_obj,
--> 720 self.axis)
721
722 return self._wrap_applied_output(
/Users/Ted/anaconda/lib/python3.6/site-packages/pandas/core/groupby.py in apply(self, f, data, axis)
1800 # group might be modified
1801 group_axes = _get_axes(group)
-> 1802 res = f(group)
1803 if not _is_indexed_like(res, group_axes):
1804 mutated = True
/Users/Ted/anaconda/lib/python3.6/site-packages/pandas/core/groupby.py in <lambda>(x)
840 def _python_agg_general(self, func, *args, **kwargs):
841 func = self._is_builtin_func(func)
--> 842 f = lambda x: func(x, *args, **kwargs)
843
844 # iterate through "columns" ex exclusions to populate output dict
TypeError: pct_between() missing 2 required positional arguments: 'low' and 'high'
# 用閉包自定義聚合函數
In[40]: def make_agg_func(func, name, *args, **kwargs):
def wrapper(x):
return func(x, *args, **kwargs)
wrapper.__name__ = name
return wrapper
my_agg1 = make_agg_func(pct_between, 'pct_1_3k', low=1000, high=3000)
my_agg2 = make_agg_func(pct_between, 'pct_10_30k', 10000, 30000)
# 用兩個自定義聚合函數進行多重聚合
In[41]: college.groupby(['STABBR', 'RELAFFIL'])['UGDS'].agg([my_agg1, my_agg2]).head()
Out[41]:
6. 檢查分組對象
# 查看分組對象的類型
In[42]: college = pd.read_csv('data/college.csv')
grouped = college.groupby(['STABBR', 'RELAFFIL'])
type(grouped)
Out[42]: pandas.core.groupby.DataFrameGroupBy
# 用dir函數找到該對象所有的可用函數
In[43]: print([attr for attr in dir(grouped) if not attr.startswith('_')])
['CITY', 'CURROPER', 'DISTANCEONLY', 'GRAD_DEBT_MDN_SUPP', 'HBCU', 'INSTNM', 'MD_EARN_WNE_P10', 'MENONLY', 'PCTFLOAN', 'PCTPELL', 'PPTUG_EF', 'RELAFFIL', 'SATMTMID', 'SATVRMID', 'STABBR', 'UG25ABV', 'UGDS', 'UGDS_2MOR', 'UGDS_AIAN', 'UGDS_ASIAN', 'UGDS_BLACK', 'UGDS_HISP', 'UGDS_NHPI', 'UGDS_NRA', 'UGDS_UNKN', 'UGDS_WHITE', 'WOMENONLY', 'agg', 'aggregate', 'all', 'any', 'apply', 'backfill', 'bfill', 'boxplot', 'corr', 'corrwith', 'count', 'cov', 'cumcount', 'cummax', 'cummin', 'cumprod', 'cumsum', 'describe', 'diff', 'dtypes', 'expanding', 'ffill', 'fillna', 'filter', 'first', 'get_group', 'groups', 'head', 'hist', 'idxmax', 'idxmin', 'indices', 'last', 'mad', 'max', 'mean', 'median', 'min', 'ndim', 'ngroup', 'ngroups', 'nth', 'nunique', 'ohlc', 'pad', 'pct_change', 'plot', 'prod', 'quantile', 'rank', 'resample', 'rolling', 'sem', 'shift', 'size', 'skew', 'std', 'sum', 'tail', 'take', 'transform', 'tshift', 'var']
# 用ngroups屬性查看分組的數量
In[44]: grouped.ngroups
Out[44]: 112
# 查看每個分組的唯一識別標簽,groups屬性是一個字典,包含每個獨立分組與行索引標簽的對應
In[45]: groups = list(grouped.groups.keys())
groups[:6]
Out[45]: [('AK', 0), ('AK', 1), ('AL', 0), ('AL', 1), ('AR', 0), ('AR', 1)]
# 用get_group,傳入分組標簽的元組。例如,獲取佛羅里達州所有與宗教相關的學校
In[46]: grouped.get_group(('FL', 1)).head()
Out[46]:
# groupby對象是一個可迭代對象,可以挨個查看每個獨立分組
In[47]: from IPython.display import display
In[48]: i = 0
for name, group in grouped:
print(name)
display(group.head(2))
i += 1
if i == 5:
break
# groupby對象使用head方法,可以在一個DataFrame中顯示每個分組的頭幾行
In[49]: grouped.head(2).head(6)
Out[49]:
更多
# nth方法可以選出每個分組指定行的數據,下面選出的是第1行和最后1行
In[50]: grouped.nth([1, -1]).head(8)
Out[50]:
7. 過濾狀態(tài)
In[51]: college = pd.read_csv('data/college.csv', index_col='INSTNM')
grouped = college.groupby('STABBR')
grouped.ngroups
Out[51]: 59
# 這等于求出不同州的個數,nunique()可以得到同樣的結果
In[52]: college['STABBR'].nunique()
Out[52]: 59
# 自定義一個計算少數民族學生總比例的函數,如果比例大于閾值,還返回True
In[53]: def check_minority(df, threshold):
minority_pct = 1 - df['UGDS_WHITE']
total_minority = (df['UGDS'] * minority_pct).sum()
total_ugds = df['UGDS'].sum()
total_minority_pct = total_minority / total_ugds
return total_minority_pct > threshold
# grouped變量有一個filter方法,可以接收一個自定義函數,決定是否保留一個分組
In[54]: college_filtered = grouped.filter(check_minority, threshold=.5)
college_filtered.head()
Out[54]:
# 通過查看形狀,可以看到過濾了60%,只有20個州的少數學生占據多數
In[55]: college.shape
Out[55]: (7535, 26)
In[56]: college_filtered.shape
Out[56]: (3028, 26)
In[57]: college_filtered['STABBR'].nunique()
Out[57]: 20
更多
# 用一些不同的閾值,檢查形狀和不同州的個數
In[58]: college_filtered_20 = grouped.filter(check_minority, threshold=.2)
college_filtered_20.shape
Out[58]: (7461, 26)
In[59]: college_filtered_20['STABBR'].nunique()
Out[59]: 57
In[60]: college_filtered_70 = grouped.filter(check_minority, threshold=.7)
college_filtered_70.shape
Out[60]: (957, 26)
In[61]: college_filtered_70['STABBR'].nunique()
Out[61]: 10
In[62]: college_filtered_95 = grouped.filter(check_minority, threshold=.95)
college_filtered_95.shape
Out[62]: (156, 26)
8. 減肥對賭
# 讀取減肥數據集,查看一月的數據
In[63]: weight_loss = pd.read_csv('data/weight_loss.csv')
weight_loss.query('Month == "Jan"')
Out[63]:
# 定義一個求減肥比例的函數
In[64]: def find_perc_loss(s):
return (s - s.iloc[0]) / s.iloc[0]
# 查看Bob在一月的減肥成果
In[65]: bob_jan = weight_loss.query('Name=="Bob" and Month=="Jan"')
find_perc_loss(bob_jan['Weight'])
Out[65]: 0 0.000000
2 -0.010309
4 -0.027491
6 -0.027491
Name: Weight, dtype: float64
# 對Name和Month進行分組,然后使用transform方法,傳入函數,對數值進行轉換
In[66]: pcnt_loss = weight_loss.groupby(['Name', 'Month'])['Weight'].transform(find_perc_loss)
pcnt_loss.head(8)
Out[66]: 0 0.000000
1 0.000000
2 -0.010309
3 -0.040609
4 -0.027491
5 -0.040609
6 -0.027491
7 -0.035533
Name: Weight, dtype: float64
# transform之后的結果,行數不變,可以賦值給原始DataFrame作為一個新列;
# 為了縮短輸出蹦玫,只選擇Bob的前兩個月數據
In[67]: weight_loss['Perc Weight Loss'] = pcnt_loss.round(3)
weight_loss.query('Name=="Bob" and Month in ["Jan", "Feb"]')
Out[67]:
# 因為最重要的是每個月的第4周,只選擇第4周的數據
In[68]: week4 = weight_loss.query('Week == "Week 4"')
week4
Out[68]:
# 用pivot重構DataFrame,讓Amy和Bob的數據并排放置
In[69]: winner = week4.pivot(index='Month', columns='Name', values='Perc Weight Loss')
winner
Out[69]:
# 用where方法選出每月的贏家
In[70]: winner['Winner'] = np.where(winner['Amy'] < winner['Bob'], 'Amy', 'Bob')
winner.style.highlight_min(axis=1)
Out[70]:
# 用value_counts()返回最后的比分
In[71]: winner.Winner.value_counts()
Out[71]: Amy 3
Bob 1
Name: Winner, dtype: int64
更多
# Pandas默認是按字母排序的
In[72]: week4a = week4.copy()
month_chron = week4a['Month'].unique()
month_chron
Out[72]: array(['Jan', 'Feb', 'Mar', 'Apr'], dtype=object)
# 轉換為Categorical變量,可以做成按時間排序
In[73]: week4a['Month'] = pd.Categorical(week4a['Month'],
categories=month_chron,
ordered=True)
week4a.pivot(index='Month', columns='Name', values='Perc Weight Loss')
Out[73]:
9. 用apply計算每州的加權平均SAT分數
# 讀取college,'UGDS', 'SATMTMID', 'SATVRMID'三列如果有缺失值則刪除行
In[74]: college = pd.read_csv('data/college.csv')
subset = ['UGDS', 'SATMTMID', 'SATVRMID']
college2 = college.dropna(subset=subset)
college.shape
Out[74]: (7535, 27)
In[75]: college2.shape
Out[75]: (1184, 27)
# 自定義一個求SAT數學成績的加權平均值的函數
In[76]: def weighted_math_average(df):
weighted_math = df['UGDS'] * df['SATMTMID']
return int(weighted_math.sum() / df['UGDS'].sum())
# 按州分組,并調用apply方法,傳入自定義函數
In[77]: college2.groupby('STABBR').apply(weighted_math_average).head()
Out[77]: STABBR
AK 503
AL 536
AR 529
AZ 569
CA 564
dtype: int64
# 效果同上
In[78]: college2.groupby('STABBR').agg(weighted_math_average).head()
Out[78]:
# 如果將列限制到SATMTMID,會報錯。這是因為不能訪問UGDS。
In[79]: college2.groupby('STABBR')['SATMTMID'].agg(weighted_math_average)
---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
pandas/_libs/index.pyx in pandas._libs.index.IndexEngine.get_loc (pandas/_libs/index.c:5126)()
pandas/_libs/hashtable_class_helper.pxi in pandas._libs.hashtable.Int64HashTable.get_item (pandas/_libs/hashtable.c:14010)()
TypeError: an integer is required
During handling of the above exception, another exception occurred:
KeyError Traceback (most recent call last)
/Users/Ted/anaconda/lib/python3.6/site-packages/pandas/core/groupby.py in agg_series(self, obj, func)
2177 try:
-> 2178 return self._aggregate_series_fast(obj, func)
2179 except Exception:
/Users/Ted/anaconda/lib/python3.6/site-packages/pandas/core/groupby.py in _aggregate_series_fast(self, obj, func)
2197 dummy)
-> 2198 result, counts = grouper.get_result()
2199 return result, counts
pandas/_libs/src/reduce.pyx in pandas._libs.lib.SeriesGrouper.get_result (pandas/_libs/lib.c:39105)()
pandas/_libs/src/reduce.pyx in pandas._libs.lib.SeriesGrouper.get_result (pandas/_libs/lib.c:38888)()
/Users/Ted/anaconda/lib/python3.6/site-packages/pandas/core/groupby.py in <lambda>(x)
841 func = self._is_builtin_func(func)
--> 842 f = lambda x: func(x, *args, **kwargs)
843
<ipython-input-76-01eb90aa258d> in weighted_math_average(df)
1 def weighted_math_average(df):
----> 2 weighted_math = df['UGDS'] * df['SATMTMID']
3 return int(weighted_math.sum() / df['UGDS'].sum())
/Users/Ted/anaconda/lib/python3.6/site-packages/pandas/core/series.py in __getitem__(self, key)
600 try:
--> 601 result = self.index.get_value(self, key)
602
/Users/Ted/anaconda/lib/python3.6/site-packages/pandas/core/indexes/base.py in get_value(self, series, key)
2476 return self._engine.get_value(s, k,
-> 2477 tz=getattr(series.dtype, 'tz', None))
2478 except KeyError as e1:
pandas/_libs/index.pyx in pandas._libs.index.IndexEngine.get_value (pandas/_libs/index.c:4404)()
pandas/_libs/index.pyx in pandas._libs.index.IndexEngine.get_value (pandas/_libs/index.c:4087)()
pandas/_libs/index.pyx in pandas._libs.index.IndexEngine.get_loc (pandas/_libs/index.c:5210)()
KeyError: 'UGDS'
During handling of the above exception, another exception occurred:
TypeError Traceback (most recent call last)
pandas/_libs/index.pyx in pandas._libs.index.IndexEngine.get_loc (pandas/_libs/index.c:5126)()
pandas/_libs/hashtable_class_helper.pxi in pandas._libs.hashtable.Int64HashTable.get_item (pandas/_libs/hashtable.c:14010)()
TypeError: an integer is required
During handling of the above exception, another exception occurred:
KeyError Traceback (most recent call last)
/Users/Ted/anaconda/lib/python3.6/site-packages/pandas/core/groupby.py in aggregate(self, func_or_funcs, *args, **kwargs)
2882 try:
-> 2883 return self._python_agg_general(func_or_funcs, *args, **kwargs)
2884 except Exception:
/Users/Ted/anaconda/lib/python3.6/site-packages/pandas/core/groupby.py in _python_agg_general(self, func, *args, **kwargs)
847 try:
--> 848 result, counts = self.grouper.agg_series(obj, f)
849 output[name] = self._try_cast(result, obj, numeric_only=True)
/Users/Ted/anaconda/lib/python3.6/site-packages/pandas/core/groupby.py in agg_series(self, obj, func)
2179 except Exception:
-> 2180 return self._aggregate_series_pure_python(obj, func)
2181
/Users/Ted/anaconda/lib/python3.6/site-packages/pandas/core/groupby.py in _aggregate_series_pure_python(self, obj, func)
2210 for label, group in splitter:
-> 2211 res = func(group)
2212 if result is None:
/Users/Ted/anaconda/lib/python3.6/site-packages/pandas/core/groupby.py in <lambda>(x)
841 func = self._is_builtin_func(func)
--> 842 f = lambda x: func(x, *args, **kwargs)
843
<ipython-input-76-01eb90aa258d> in weighted_math_average(df)
1 def weighted_math_average(df):
----> 2 weighted_math = df['UGDS'] * df['SATMTMID']
3 return int(weighted_math.sum() / df['UGDS'].sum())
/Users/Ted/anaconda/lib/python3.6/site-packages/pandas/core/series.py in __getitem__(self, key)
600 try:
--> 601 result = self.index.get_value(self, key)
602
/Users/Ted/anaconda/lib/python3.6/site-packages/pandas/core/indexes/base.py in get_value(self, series, key)
2476 return self._engine.get_value(s, k,
-> 2477 tz=getattr(series.dtype, 'tz', None))
2478 except KeyError as e1:
pandas/_libs/index.pyx in pandas._libs.index.IndexEngine.get_value (pandas/_libs/index.c:4404)()
pandas/_libs/index.pyx in pandas._libs.index.IndexEngine.get_value (pandas/_libs/index.c:4087)()
pandas/_libs/index.pyx in pandas._libs.index.IndexEngine.get_loc (pandas/_libs/index.c:5210)()
KeyError: 'UGDS'
During handling of the above exception, another exception occurred:
TypeError Traceback (most recent call last)
pandas/_libs/index.pyx in pandas._libs.index.IndexEngine.get_loc (pandas/_libs/index.c:5126)()
pandas/_libs/hashtable_class_helper.pxi in pandas._libs.hashtable.Int64HashTable.get_item (pandas/_libs/hashtable.c:14010)()
TypeError: an integer is required
During handling of the above exception, another exception occurred:
KeyError Traceback (most recent call last)
<ipython-input-79-1351e4f306c7> in <module>()
----> 1 college2.groupby('STABBR')['SATMTMID'].agg(weighted_math_average)
/Users/Ted/anaconda/lib/python3.6/site-packages/pandas/core/groupby.py in aggregate(self, func_or_funcs, *args, **kwargs)
2883 return self._python_agg_general(func_or_funcs, *args, **kwargs)
2884 except Exception:
-> 2885 result = self._aggregate_named(func_or_funcs, *args, **kwargs)
2886
2887 index = Index(sorted(result), name=self.grouper.names[0])
/Users/Ted/anaconda/lib/python3.6/site-packages/pandas/core/groupby.py in _aggregate_named(self, func, *args, **kwargs)
3013 for name, group in self:
3014 group.name = name
-> 3015 output = func(group, *args, **kwargs)
3016 if isinstance(output, (Series, Index, np.ndarray)):
3017 raise Exception('Must produce aggregated value')
<ipython-input-76-01eb90aa258d> in weighted_math_average(df)
1 def weighted_math_average(df):
----> 2 weighted_math = df['UGDS'] * df['SATMTMID']
3 return int(weighted_math.sum() / df['UGDS'].sum())
/Users/Ted/anaconda/lib/python3.6/site-packages/pandas/core/series.py in __getitem__(self, key)
599 key = com._apply_if_callable(key, self)
600 try:
--> 601 result = self.index.get_value(self, key)
602
603 if not is_scalar(result):
/Users/Ted/anaconda/lib/python3.6/site-packages/pandas/core/indexes/base.py in get_value(self, series, key)
2475 try:
2476 return self._engine.get_value(s, k,
-> 2477 tz=getattr(series.dtype, 'tz', None))
2478 except KeyError as e1:
2479 if len(self) > 0 and self.inferred_type in ['integer', 'boolean']:
pandas/_libs/index.pyx in pandas._libs.index.IndexEngine.get_value (pandas/_libs/index.c:4404)()
pandas/_libs/index.pyx in pandas._libs.index.IndexEngine.get_value (pandas/_libs/index.c:4087)()
pandas/_libs/index.pyx in pandas._libs.index.IndexEngine.get_loc (pandas/_libs/index.c:5210)()
KeyError: 'UGDS'
# apply的一個不錯的功能是通過返回Series,創建多個新的列
In[80]: from collections import OrderedDict
def weighted_average(df):
data = OrderedDict()
weight_m = df['UGDS'] * df['SATMTMID']
weight_v = df['UGDS'] * df['SATVRMID']
data['weighted_math_avg'] = weight_m.sum() / df['UGDS'].sum()
data['weighted_verbal_avg'] = weight_v.sum() / df['UGDS'].sum()
data['math_avg'] = df['SATMTMID'].mean()
data['verbal_avg'] = df['SATVRMID'].mean()
data['count'] = len(df)
return pd.Series(data, dtype='int')
college2.groupby('STABBR').apply(weighted_average).head(10)
Out[80]:
# 多創(chuàng)建兩個新的列
In[81]: from collections import OrderedDict
def weighted_average(df):
data = OrderedDict()
weight_m = df['UGDS'] * df['SATMTMID']
weight_v = df['UGDS'] * df['SATVRMID']
wm_avg = weight_m.sum() / df['UGDS'].sum()
wv_avg = weight_v.sum() / df['UGDS'].sum()
data['weighted_math_avg'] = wm_avg
data['weighted_verbal_avg'] = wv_avg
data['math_avg'] = df['SATMTMID'].mean()
data['verbal_avg'] = df['SATVRMID'].mean()
data['count'] = len(df)
return pd.Series(data, dtype='int')
college2.groupby('STABBR').apply(weighted_average).head(10)
Out[81]:
更多
# 自定義一個返回DataFrame的函數,使用NumPy的函數average計算加權平均值,使用SciPy的gmean和hmean計算幾何和調和平均值
In[82]: from scipy.stats import gmean, hmean
def calculate_means(df):
df_means = pd.DataFrame(index=['Arithmetic', 'Weighted', 'Geometric', 'Harmonic'])
cols = ['SATMTMID', 'SATVRMID']
for col in cols:
arithmetic = df[col].mean()
weighted = np.average(df[col], weights=df['UGDS'])
geometric = gmean(df[col])
harmonic = hmean(df[col])
df_means[col] = [arithmetic, weighted, geometric, harmonic]
df_means['count'] = len(df)
return df_means.astype(int)
college2.groupby('STABBR').filter(lambda x: len(x) != 1).groupby('STABBR').apply(calculate_means).head(10)
Out[82]:
10. 用連續(xù)變量分組
In[83]: flights = pd.read_csv('data/flights.csv')
flights.head()
Out[83]:
# 判斷DIST列有無缺失值
In[84]: flights.DIST.hasnans
Out[84]: False
# 再次刪除DIST列的缺失值(原書是沒有這兩段的)
In[85]: flights.dropna(subset=['DIST']).shape
Out[85]: (58492, 14)
# 使用Pandas的cut函數,將數據分成5個面元
In[86]: bins = [-np.inf, 200, 500, 1000, 2000, np.inf]
cuts = pd.cut(flights['DIST'], bins=bins)
cuts.head()
Out[86]: 0 (500.0, 1000.0]
1 (1000.0, 2000.0]
2 (500.0, 1000.0]
3 (1000.0, 2000.0]
4 (1000.0, 2000.0]
Name: DIST, dtype: category
Categories (5, interval[float64]): [(-inf, 200.0] < (200.0, 500.0] < (500.0, 1000.0] < (1000.0, 2000.0] < (2000.0, inf]]
# 對每個面元進行統計
In[87]: cuts.value_counts()
Out[87]: (500.0, 1000.0] 20659
(200.0, 500.0] 15874
(1000.0, 2000.0] 14186
(2000.0, inf] 4054
(-inf, 200.0] 3719
Name: DIST, dtype: int64
# 面元Series可以用來進行分組
In[88]: flights.groupby(cuts)['AIRLINE'].value_counts(normalize=True).round(3).head(15)
Out[88]: DIST AIRLINE
(-inf, 200.0] OO 0.326
EV 0.289
MQ 0.211
DL 0.086
AA 0.052
UA 0.027
WN 0.009
(200.0, 500.0] WN 0.194
DL 0.189
OO 0.159
EV 0.156
MQ 0.100
AA 0.071
UA 0.062
VX 0.028
Name: AIRLINE, dtype: float64
原理
In[89]: flights.groupby(cuts)['AIRLINE'].value_counts(normalize=True)
Out[89]:
DIST AIRLINE
(-inf, 200.0] OO 0.325625
EV 0.289325
MQ 0.210809
DL 0.086045
AA 0.052165
UA 0.027427
WN 0.008604
(200.0, 500.0] WN 0.193902
DL 0.188736
OO 0.158687
EV 0.156293
MQ 0.100164
AA 0.071375
UA 0.062051
VX 0.028222
US 0.016001
NK 0.011843
B6 0.006867
F9 0.004914
AS 0.000945
(500.0, 1000.0] DL 0.205625
AA 0.143908
WN 0.138196
UA 0.131129
OO 0.106443
EV 0.100683
MQ 0.051213
F9 0.038192
NK 0.029527
US 0.025316
AS 0.023234
VX 0.003582
B6 0.002953
(1000.0, 2000.0] AA 0.263781
UA 0.199070
DL 0.165092
WN 0.159664
OO 0.046454
NK 0.045115
US 0.040462
F9 0.030664
AS 0.015931
EV 0.015579
VX 0.012125
B6 0.003313
MQ 0.002749
(2000.0, inf] UA 0.289097
AA 0.211643
DL 0.171436
B6 0.080414
VX 0.073754
US 0.065121
WN 0.046374
HA 0.027627
NK 0.019240
AS 0.011593
F9 0.003700
Name: AIRLINE, dtype: float64
更多
# 求飛行時間的0.25,0.5,0.75分位數
In[90]: flights.groupby(cuts)['AIR_TIME'].quantile(q=[.25, .5, .75]).div(60).round(2)
Out[90]: DIST
(-inf, 200.0] 0.25 0.43
0.50 0.50
0.75 0.57
(200.0, 500.0] 0.25 0.77
0.50 0.92
0.75 1.05
(500.0, 1000.0] 0.25 1.43
0.50 1.65
0.75 1.92
(1000.0, 2000.0] 0.25 2.50
0.50 2.93
0.75 3.40
(2000.0, inf] 0.25 4.30
0.50 4.70
0.75 5.03
Name: AIR_TIME, dtype: float64
# unstack方法可以將內層的索引變為列名
In[91]: labels=['Under an Hour', '1 Hour', '1-2 Hours', '2-4 Hours', '4+ Hours']
cuts2 = pd.cut(flights['DIST'], bins=bins, labels=labels)
flights.groupby(cuts2)['AIRLINE'].value_counts(normalize=True).round(3).unstack().style.highlight_max(axis=1)
Out[91]:
11. 計算城市之間的航班總數
In[92]: flights = pd.read_csv('data/flights.csv')
flights.head()
Out[92]:
# 求每兩個城市間的航班總數
In[93]: flights_ct = flights.groupby(['ORG_AIR', 'DEST_AIR']).size()
flights_ct.head()
Out[93]: ORG_AIR DEST_AIR
ATL ABE 31
ABQ 16
ABY 19
ACY 6
AEX 40
dtype: int64
# 選出休斯頓(IAH)和亞特蘭大(ATL)之間雙方向的航班總數
In[94]: flights_ct.loc[[('ATL', 'IAH'), ('IAH', 'ATL')]]
Out[94]: ORG_AIR DEST_AIR
ATL IAH 121
IAH ATL 148
dtype: int64
# 分別對每行按照出發(fā)地和目的地,按字母排序
In[95]: flights_sort = flights[['ORG_AIR', 'DEST_AIR']].apply(sorted, axis=1)
flights_sort.head()
Out[95]:
# 因為現在每行都是獨立排序的,列名存在問題。對列重命名,然后再計算所有城市間的航班數
In[96]: rename_dict = {'ORG_AIR':'AIR1','DEST_AIR':'AIR2'}
flights_sort = flights_sort.rename(columns=rename_dict)
flights_ct2 = flights_sort.groupby(['AIR1', 'AIR2']).size()
flights_ct2.head()
Out[96]: AIR1 AIR2
ABE ATL 31
ORD 24
ABI DFW 74
ABQ ATL 16
DEN 46
dtype: int64
# 找到亞特蘭大和休斯頓之間的航班數
In[97]: flights_ct2.loc[('ATL', 'IAH')]
Out[97]: 269
# 如果調換順序,則會出錯
In[98]: flights_ct2.loc[('IAH', 'ATL')]
---------------------------------------------------------------------------
IndexingError Traceback (most recent call last)
<ipython-input-98-56147a7d0bb5> in <module>()
----> 1 flights_ct2.loc[('IAH', 'ATL')]
/Users/Ted/anaconda/lib/python3.6/site-packages/pandas/core/indexing.py in __getitem__(self, key)
1323 except (KeyError, IndexError):
1324 pass
-> 1325 return self._getitem_tuple(key)
1326 else:
1327 key = com._apply_if_callable(key, self.obj)
/Users/Ted/anaconda/lib/python3.6/site-packages/pandas/core/indexing.py in _getitem_tuple(self, tup)
839
840 # no multi-index, so validate all of the indexers
--> 841 self._has_valid_tuple(tup)
842
843 # ugly hack for GH #836
/Users/Ted/anaconda/lib/python3.6/site-packages/pandas/core/indexing.py in _has_valid_tuple(self, key)
186 for i, k in enumerate(key):
187 if i >= self.obj.ndim:
--> 188 raise IndexingError('Too many indexers')
189 if not self._has_valid_type(k, i):
190 raise ValueError("Location based indexing can only have [%s] "
IndexingError: Too many indexers
更多
# 用NumPy的sort函數可以大大提高速度
In[99]: data_sorted = np.sort(flights[['ORG_AIR', 'DEST_AIR']])
data_sorted[:10]
Out[99]: array([['LAX', 'SLC'],
['DEN', 'IAD'],
['DFW', 'VPS'],
['DCA', 'DFW'],
['LAX', 'MCI'],
['IAH', 'SAN'],
['DFW', 'MSY'],
['PHX', 'SFO'],
['ORD', 'STL'],
['IAH', 'SJC']], dtype=object)
# 重新用DataFrame構造器創建一個DataFrame,檢測其是否與flights_sorted相等
In[100]: flights_sort2 = pd.DataFrame(data_sorted, columns=['AIR1', 'AIR2'])
fs_orig = flights_sort.rename(columns={'ORG_AIR':'AIR1', 'DEST_AIR':'AIR2'})
flights_sort2.equals(fs_orig)
Out[100]: True
# 比較速度
In[101]: %timeit flights_sort = flights[['ORG_AIR', 'DEST_AIR']].apply(sorted, axis=1)
7.82 s ± 189 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
In[102]: %%timeit
data_sorted = np.sort(flights[['ORG_AIR', 'DEST_AIR']])
flights_sort2 = pd.DataFrame(data_sorted, columns=['AIR1', 'AIR2'])
10.9 ms ± 325 μs per loop (mean ± std. dev. of 7 runs, 100 loops each)
12. 找到持續(xù)最長的準時航班
# 創建一個Series
In[103]: s = pd.Series([1, 1, 1, 0, 1, 1, 1, 0])
s
Out[103]: 0 1
1 1
2 1
3 0
4 1
5 1
6 1
7 0
dtype: int64
# 累積求和
In[104]: s1 = s.cumsum()
s1
Out[104]: 0 1
1 2
2 3
3 3
4 4
5 5
6 6
7 6
dtype: int64
In[105]: s.mul(s1).diff()
Out[105]: 0 NaN
1 1.0
2 1.0
3 -3.0
4 4.0
5 1.0
6 1.0
7 -6.0
dtype: float64
# 將所有非負值變為缺失值
In[106]: s.mul(s1).diff().where(lambda x: x < 0)
Out[106]: 0 NaN
1 NaN
2 NaN
3 -3.0
4 NaN
5 NaN
6 NaN
7 -6.0
dtype: float64
In[107]: s.mul(s1).diff().where(lambda x: x < 0).ffill().add(s1, fill_value=0)
Out[107]: 0 1.0
1 2.0
2 3.0
3 0.0
4 1.0
5 2.0
6 3.0
7 0.0
dtype: float64
# 創建一個準時的列 ON_TIME
In[108]: flights = pd.read_csv('data/flights.csv')
flights['ON_TIME'] = flights['ARR_DELAY'].lt(15).astype(int)
flights[['AIRLINE', 'ORG_AIR', 'ON_TIME']].head(10)
Out[108]:
# 將之前的邏輯做成一個函數
In[109]: def max_streak(s):
s1 = s.cumsum()
return s.mul(s1).diff().where(lambda x: x < 0) \
.ffill().add(s1, fill_value=0).max()
In[110]: flights.sort_values(['MONTH', 'DAY', 'SCHED_DEP']) \
.groupby(['AIRLINE', 'ORG_AIR'])['ON_TIME'] \
.agg(['mean', 'size', max_streak]).round(2).head()
Out[110]:
更多
# 求最長的延誤航班
In[111]: def max_delay_streak(df):
df = df.reset_index(drop=True)
s = 1 - df['ON_TIME']
s1 = s.cumsum()
streak = s.mul(s1).diff().where(lambda x: x < 0) \
.ffill().add(s1, fill_value=0)
last_idx = streak.idxmax()
first_idx = last_idx - streak.max() + 1
df_return = df.loc[[first_idx, last_idx], ['MONTH', 'DAY']]
df_return['streak'] = streak.max()
df_return.index = ['first', 'last']
df_return.index.name='streak_row'
return df_return
In[112]: flights.sort_values(['MONTH', 'DAY', 'SCHED_DEP']) \
.groupby(['AIRLINE', 'ORG_AIR']) \
.apply(max_delay_streak) \
.sort_values(['streak','MONTH','DAY'], ascending=[False, True, True]).head(10)
Out[112]:
第01章 Pandas基礎
第02章 DataFrame運算
第03章 數據分析入門
第04章 選取數據子集
第05章 布爾索引
第06章 索引對齊
第07章 分組聚合、過濾源请、轉換
第08章 數據清理
第09章 合并Pandas對象
第10章 時間序列分析
第11章 用Matplotlib枪芒、Pandas彻况、Seaborn進行可視化