9. 用apply计算每州的加权平均SAT分数

  1. # 读取college,'UGDS', 'SATMTMID', 'SATVRMID'三列如果有缺失值则删除行
  2. In[74]: college = pd.read_csv('data/college.csv')
  3. subset = ['UGDS', 'SATMTMID', 'SATVRMID']
  4. college2 = college.dropna(subset=subset)
  5. college.shape
  6. Out[74]: (7535, 27)
  7. In[75]: college2.shape
  8. Out[75]: (1184, 27)
  1. # 自定义一个求SAT数学成绩的加权平均值的函数
  2. In[76]: def weighted_math_average(df):
  3. weighted_math = df['UGDS'] * df['SATMTMID']
  4. return int(weighted_math.sum() / df['UGDS'].sum())
  5. # 按州分组,并调用apply方法,传入自定义函数
  6. In[77]: college2.groupby('STABBR').apply(weighted_math_average).head()
  7. Out[77]: STABBR
  8. AK 503
  9. AL 536
  10. AR 529
  11. AZ 569
  12. CA 564
  13. dtype: int64
  1. # 效果同上
  2. In[78]: college2.groupby('STABBR').agg(weighted_math_average).head()
  3. Out[78]:

9. 用apply计算每州的加权平均SAT分数 - 图1

  1. # 如果将列限制到SATMTMID,会报错。这是因为不能访问UGDS。
  2. In[79]: college2.groupby('STABBR')['SATMTMID'].agg(weighted_math_average)
  3. ---------------------------------------------------------------------------
  4. TypeError Traceback (most recent call last)
  5. pandas/_libs/index.pyx in pandas._libs.index.IndexEngine.get_loc (pandas/_libs/index.c:5126)()
  6. pandas/_libs/hashtable_class_helper.pxi in pandas._libs.hashtable.Int64HashTable.get_item (pandas/_libs/hashtable.c:14010)()
  7. TypeError: an integer is required
  8. During handling of the above exception, another exception occurred:
  9. KeyError Traceback (most recent call last)
  10. /Users/Ted/anaconda/lib/python3.6/site-packages/pandas/core/groupby.py in agg_series(self, obj, func)
  11. 2177 try:
  12. -> 2178 return self._aggregate_series_fast(obj, func)
  13. 2179 except Exception:
  14. /Users/Ted/anaconda/lib/python3.6/site-packages/pandas/core/groupby.py in _aggregate_series_fast(self, obj, func)
  15. 2197 dummy)
  16. -> 2198 result, counts = grouper.get_result()
  17. 2199 return result, counts
  18. pandas/_libs/src/reduce.pyx in pandas._libs.lib.SeriesGrouper.get_result (pandas/_libs/lib.c:39105)()
  19. pandas/_libs/src/reduce.pyx in pandas._libs.lib.SeriesGrouper.get_result (pandas/_libs/lib.c:38888)()
  20. /Users/Ted/anaconda/lib/python3.6/site-packages/pandas/core/groupby.py in <lambda>(x)
  21. 841 func = self._is_builtin_func(func)
  22. --> 842 f = lambda x: func(x, *args, **kwargs)
  23. 843
  24. <ipython-input-76-01eb90aa258d> in weighted_math_average(df)
  25. 1 def weighted_math_average(df):
  26. ----> 2 weighted_math = df['UGDS'] * df['SATMTMID']
  27. 3 return int(weighted_math.sum() / df['UGDS'].sum())
  28. /Users/Ted/anaconda/lib/python3.6/site-packages/pandas/core/series.py in __getitem__(self, key)
  29. 600 try:
  30. --> 601 result = self.index.get_value(self, key)
  31. 602
  32. /Users/Ted/anaconda/lib/python3.6/site-packages/pandas/core/indexes/base.py in get_value(self, series, key)
  33. 2476 return self._engine.get_value(s, k,
  34. -> 2477 tz=getattr(series.dtype, 'tz', None))
  35. 2478 except KeyError as e1:
  36. pandas/_libs/index.pyx in pandas._libs.index.IndexEngine.get_value (pandas/_libs/index.c:4404)()
  37. pandas/_libs/index.pyx in pandas._libs.index.IndexEngine.get_value (pandas/_libs/index.c:4087)()
  38. pandas/_libs/index.pyx in pandas._libs.index.IndexEngine.get_loc (pandas/_libs/index.c:5210)()
  39. KeyError: 'UGDS'
  40. During handling of the above exception, another exception occurred:
  41. TypeError Traceback (most recent call last)
  42. pandas/_libs/index.pyx in pandas._libs.index.IndexEngine.get_loc (pandas/_libs/index.c:5126)()
  43. pandas/_libs/hashtable_class_helper.pxi in pandas._libs.hashtable.Int64HashTable.get_item (pandas/_libs/hashtable.c:14010)()
  44. TypeError: an integer is required
  45. During handling of the above exception, another exception occurred:
  46. KeyError Traceback (most recent call last)
  47. /Users/Ted/anaconda/lib/python3.6/site-packages/pandas/core/groupby.py in aggregate(self, func_or_funcs, *args, **kwargs)
  48. 2882 try:
  49. -> 2883 return self._python_agg_general(func_or_funcs, *args, **kwargs)
  50. 2884 except Exception:
  51. /Users/Ted/anaconda/lib/python3.6/site-packages/pandas/core/groupby.py in _python_agg_general(self, func, *args, **kwargs)
  52. 847 try:
  53. --> 848 result, counts = self.grouper.agg_series(obj, f)
  54. 849 output[name] = self._try_cast(result, obj, numeric_only=True)
  55. /Users/Ted/anaconda/lib/python3.6/site-packages/pandas/core/groupby.py in agg_series(self, obj, func)
  56. 2179 except Exception:
  57. -> 2180 return self._aggregate_series_pure_python(obj, func)
  58. 2181
  59. /Users/Ted/anaconda/lib/python3.6/site-packages/pandas/core/groupby.py in _aggregate_series_pure_python(self, obj, func)
  60. 2210 for label, group in splitter:
  61. -> 2211 res = func(group)
  62. 2212 if result is None:
  63. /Users/Ted/anaconda/lib/python3.6/site-packages/pandas/core/groupby.py in <lambda>(x)
  64. 841 func = self._is_builtin_func(func)
  65. --> 842 f = lambda x: func(x, *args, **kwargs)
  66. 843
  67. <ipython-input-76-01eb90aa258d> in weighted_math_average(df)
  68. 1 def weighted_math_average(df):
  69. ----> 2 weighted_math = df['UGDS'] * df['SATMTMID']
  70. 3 return int(weighted_math.sum() / df['UGDS'].sum())
  71. /Users/Ted/anaconda/lib/python3.6/site-packages/pandas/core/series.py in __getitem__(self, key)
  72. 600 try:
  73. --> 601 result = self.index.get_value(self, key)
  74. 602
  75. /Users/Ted/anaconda/lib/python3.6/site-packages/pandas/core/indexes/base.py in get_value(self, series, key)
  76. 2476 return self._engine.get_value(s, k,
  77. -> 2477 tz=getattr(series.dtype, 'tz', None))
  78. 2478 except KeyError as e1:
  79. pandas/_libs/index.pyx in pandas._libs.index.IndexEngine.get_value (pandas/_libs/index.c:4404)()
  80. pandas/_libs/index.pyx in pandas._libs.index.IndexEngine.get_value (pandas/_libs/index.c:4087)()
  81. pandas/_libs/index.pyx in pandas._libs.index.IndexEngine.get_loc (pandas/_libs/index.c:5210)()
  82. KeyError: 'UGDS'
  83. During handling of the above exception, another exception occurred:
  84. TypeError Traceback (most recent call last)
  85. pandas/_libs/index.pyx in pandas._libs.index.IndexEngine.get_loc (pandas/_libs/index.c:5126)()
  86. pandas/_libs/hashtable_class_helper.pxi in pandas._libs.hashtable.Int64HashTable.get_item (pandas/_libs/hashtable.c:14010)()
  87. TypeError: an integer is required
  88. During handling of the above exception, another exception occurred:
  89. KeyError Traceback (most recent call last)
  90. <ipython-input-79-1351e4f306c7> in <module>()
  91. ----> 1 college2.groupby('STABBR')['SATMTMID'].agg(weighted_math_average)
  92. /Users/Ted/anaconda/lib/python3.6/site-packages/pandas/core/groupby.py in aggregate(self, func_or_funcs, *args, **kwargs)
  93. 2883 return self._python_agg_general(func_or_funcs, *args, **kwargs)
  94. 2884 except Exception:
  95. -> 2885 result = self._aggregate_named(func_or_funcs, *args, **kwargs)
  96. 2886
  97. 2887 index = Index(sorted(result), name=self.grouper.names[0])
  98. /Users/Ted/anaconda/lib/python3.6/site-packages/pandas/core/groupby.py in _aggregate_named(self, func, *args, **kwargs)
  99. 3013 for name, group in self:
  100. 3014 group.name = name
  101. -> 3015 output = func(group, *args, **kwargs)
  102. 3016 if isinstance(output, (Series, Index, np.ndarray)):
  103. 3017 raise Exception('Must produce aggregated value')
  104. <ipython-input-76-01eb90aa258d> in weighted_math_average(df)
  105. 1 def weighted_math_average(df):
  106. ----> 2 weighted_math = df['UGDS'] * df['SATMTMID']
  107. 3 return int(weighted_math.sum() / df['UGDS'].sum())
  108. /Users/Ted/anaconda/lib/python3.6/site-packages/pandas/core/series.py in __getitem__(self, key)
  109. 599 key = com._apply_if_callable(key, self)
  110. 600 try:
  111. --> 601 result = self.index.get_value(self, key)
  112. 602
  113. 603 if not is_scalar(result):
  114. /Users/Ted/anaconda/lib/python3.6/site-packages/pandas/core/indexes/base.py in get_value(self, series, key)
  115. 2475 try:
  116. 2476 return self._engine.get_value(s, k,
  117. -> 2477 tz=getattr(series.dtype, 'tz', None))
  118. 2478 except KeyError as e1:
  119. 2479 if len(self) > 0 and self.inferred_type in ['integer', 'boolean']:
  120. pandas/_libs/index.pyx in pandas._libs.index.IndexEngine.get_value (pandas/_libs/index.c:4404)()
  121. pandas/_libs/index.pyx in pandas._libs.index.IndexEngine.get_value (pandas/_libs/index.c:4087)()
  122. pandas/_libs/index.pyx in pandas._libs.index.IndexEngine.get_loc (pandas/_libs/index.c:5210)()
  123. KeyError: 'UGDS'
  1. # apply的一个不错的功能是通过返回Series,创建多个新的列
  2. In[80]: from collections import OrderedDict
  3. def weighted_average(df):
  4. data = OrderedDict()
  5. weight_m = df['UGDS'] * df['SATMTMID']
  6. weight_v = df['UGDS'] * df['SATVRMID']
  7. data['weighted_math_avg'] = weight_m.sum() / df['UGDS'].sum()
  8. data['weighted_verbal_avg'] = weight_v.sum() / df['UGDS'].sum()
  9. data['math_avg'] = df['SATMTMID'].mean()
  10. data['verbal_avg'] = df['SATVRMID'].mean()
  11. data['count'] = len(df)
  12. return pd.Series(data, dtype='int')
  13. college2.groupby('STABBR').apply(weighted_average).head(10)
  14. Out[80]:

9. 用apply计算每州的加权平均SAT分数 - 图2

  1. # 多创建两个新的列
  2. In[81]: from collections import OrderedDict
  3. def weighted_average(df):
  4. data = OrderedDict()
  5. weight_m = df['UGDS'] * df['SATMTMID']
  6. weight_v = df['UGDS'] * df['SATVRMID']
  7. wm_avg = weight_m.sum() / df['UGDS'].sum()
  8. wv_avg = weight_v.sum() / df['UGDS'].sum()
  9. data['weighted_math_avg'] = wm_avg
  10. data['weighted_verbal_avg'] = wv_avg
  11. data['math_avg'] = df['SATMTMID'].mean()
  12. data['verbal_avg'] = df['SATVRMID'].mean()
  13. data['count'] = len(df)
  14. return pd.Series(data, dtype='int')
  15. college2.groupby('STABBR').apply(weighted_average).head(10)
  16. Out[81]:

9. 用apply计算每州的加权平均SAT分数 - 图3

更多

  1. # 自定义一个返回DataFrame的函数,使用NumPy的函数average计算加权平均值,使用SciPy的gmean和hmean计算几何和调和平均值
  2. In[82]: from scipy.stats import gmean, hmean
  3. def calculate_means(df):
  4. df_means = pd.DataFrame(index=['Arithmetic', 'Weighted', 'Geometric', 'Harmonic'])
  5. cols = ['SATMTMID', 'SATVRMID']
  6. for col in cols:
  7. arithmetic = df[col].mean()
  8. weighted = np.average(df[col], weights=df['UGDS'])
  9. geometric = gmean(df[col])
  10. harmonic = hmean(df[col])
  11. df_means[col] = [arithmetic, weighted, geometric, harmonic]
  12. df_means['count'] = len(df)
  13. return df_means.astype(int)
  14. college2.groupby('STABBR').filter(lambda x: len(x) != 1).groupby('STABBR').apply(calculate_means).head(10)
  15. Out[82]:

9. 用apply计算每州的加权平均SAT分数 - 图4