1. 定义聚合
# 读取flights数据集,查看前几行
In[2]: flights = pd.read_csv('data/flights.csv')
flights.head()
Out[2]:
# 按照AIRLINE分组,使用agg方法,以字典形式传入要聚合的列及其对应的聚合函数
In[3]: flights.groupby('AIRLINE').agg({'ARR_DELAY':'mean'}).head()
Out[3]:
# 或者先用索引运算符选取要聚合的列,再将聚合函数的名称以字符串形式传入agg
In[4]: flights.groupby('AIRLINE')['ARR_DELAY'].agg('mean').head()
Out[4]:
AIRLINE
AA 5.542661
AS -0.833333
B6 8.692593
DL 0.339691
EV 7.034580
Name: ARR_DELAY, dtype: float64
# 也可以向agg中传入NumPy的mean函数
In[5]: flights.groupby('AIRLINE')['ARR_DELAY'].agg(np.mean).head()
Out[5]:
# 也可以直接使用mean()函数
In[6]: flights.groupby('AIRLINE')['ARR_DELAY'].mean().head()
Out[6]:
原理
# groupby方法产生的是一个DataFrameGroupBy对象
In[7]: grouped = flights.groupby('AIRLINE')
type(grouped)
Out[7]: pandas.core.groupby.DataFrameGroupBy
更多
# 聚合函数必须把每组数据归约为单个值;如果agg接收的不是聚合函数(例如np.sqrt对每个元素各返回一个值,不做归约),则会导致异常
In[8]: flights.groupby('AIRLINE')['ARR_DELAY'].agg(np.sqrt)
/Users/Ted/anaconda/lib/python3.6/site-packages/pandas/core/groupby.py:842: RuntimeWarning: invalid value encountered in sqrt
f = lambda x: func(x, *args, **kwargs)
/Users/Ted/anaconda/lib/python3.6/site-packages/pandas/core/groupby.py:3015: RuntimeWarning: invalid value encountered in sqrt
output = func(group, *args, **kwargs)
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
/Users/Ted/anaconda/lib/python3.6/site-packages/pandas/core/groupby.py in agg_series(self, obj, func)
2177 try:
-> 2178 return self._aggregate_series_fast(obj, func)
2179 except Exception:
/Users/Ted/anaconda/lib/python3.6/site-packages/pandas/core/groupby.py in _aggregate_series_fast(self, obj, func)
2197 dummy)
-> 2198 result, counts = grouper.get_result()
2199 return result, counts
pandas/_libs/src/reduce.pyx in pandas._libs.lib.SeriesGrouper.get_result (pandas/_libs/lib.c:39105)()
pandas/_libs/src/reduce.pyx in pandas._libs.lib.SeriesGrouper.get_result (pandas/_libs/lib.c:38973)()
pandas/_libs/src/reduce.pyx in pandas._libs.lib._get_result_array (pandas/_libs/lib.c:32039)()
ValueError: function does not reduce
During handling of the above exception, another exception occurred:
ValueError Traceback (most recent call last)
/Users/Ted/anaconda/lib/python3.6/site-packages/pandas/core/groupby.py in aggregate(self, func_or_funcs, *args, **kwargs)
2882 try:
-> 2883 return self._python_agg_general(func_or_funcs, *args, **kwargs)
2884 except Exception:
/Users/Ted/anaconda/lib/python3.6/site-packages/pandas/core/groupby.py in _python_agg_general(self, func, *args, **kwargs)
847 try:
--> 848 result, counts = self.grouper.agg_series(obj, f)
849 output[name] = self._try_cast(result, obj, numeric_only=True)
/Users/Ted/anaconda/lib/python3.6/site-packages/pandas/core/groupby.py in agg_series(self, obj, func)
2179 except Exception:
-> 2180 return self._aggregate_series_pure_python(obj, func)
2181
/Users/Ted/anaconda/lib/python3.6/site-packages/pandas/core/groupby.py in _aggregate_series_pure_python(self, obj, func)
2214 isinstance(res, list)):
-> 2215 raise ValueError('Function does not reduce')
2216 result = np.empty(ngroups, dtype='O')
ValueError: Function does not reduce
During handling of the above exception, another exception occurred:
Exception Traceback (most recent call last)
<ipython-input-8-2bcc9ccfec77> in <module>()
----> 1 flights.groupby('AIRLINE')['ARR_DELAY'].agg(np.sqrt)
/Users/Ted/anaconda/lib/python3.6/site-packages/pandas/core/groupby.py in aggregate(self, func_or_funcs, *args, **kwargs)
2883 return self._python_agg_general(func_or_funcs, *args, **kwargs)
2884 except Exception:
-> 2885 result = self._aggregate_named(func_or_funcs, *args, **kwargs)
2886
2887 index = Index(sorted(result), name=self.grouper.names[0])
/Users/Ted/anaconda/lib/python3.6/site-packages/pandas/core/groupby.py in _aggregate_named(self, func, *args, **kwargs)
3015 output = func(group, *args, **kwargs)
3016 if isinstance(output, (Series, Index, np.ndarray)):
-> 3017 raise Exception('Must produce aggregated value')
3018 result[name] = self._try_cast(output, group)
3019
Exception: Must produce aggregated value