10. 用连续变量分组

  1. In[83]: flights = pd.read_csv('data/flights.csv')
  2. flights.head()
  3. Out[83]:

10. 用连续变量分组 - 图1

  1. # 判断DIST列有无缺失值
  2. In[84]: flights.DIST.hasnans
  3. Out[84]: False
  1. # 再次删除DIST列的缺失值(原书是没有这两段的)
  2. In[85]: flights.dropna(subset=['DIST']).shape
  3. Out[85]: (58492, 14)
  1. # 使用Pandas的cut函数,将数据分成5个面元
  2. In[86]: bins = [-np.inf, 200, 500, 1000, 2000, np.inf]
  3. cuts = pd.cut(flights['DIST'], bins=bins)
  4. cuts.head()
  5. Out[86]: 0 (500.0, 1000.0]
  6. 1 (1000.0, 2000.0]
  7. 2 (500.0, 1000.0]
  8. 3 (1000.0, 2000.0]
  9. 4 (1000.0, 2000.0]
  10. Name: DIST, dtype: category
  11. Categories (5, interval[float64]): [(-inf, 200.0] < (200.0, 500.0] < (500.0, 1000.0] < (1000.0, 2000.0] < (2000.0, inf]]
  1. # 对每个面元进行统计
  2. In[87]: cuts.value_counts()
  3. Out[87]: (500.0, 1000.0] 20659
  4. (200.0, 500.0] 15874
  5. (1000.0, 2000.0] 14186
  6. (2000.0, inf] 4054
  7. (-inf, 200.0] 3719
  8. Name: DIST, dtype: int64
  1. # 面元Series可以用来进行分组
  2. In[88]: flights.groupby(cuts)['AIRLINE'].value_counts(normalize=True).round(3).head(15)
  3. Out[88]: DIST AIRLINE
  4. (-inf, 200.0] OO 0.326
  5. EV 0.289
  6. MQ 0.211
  7. DL 0.086
  8. AA 0.052
  9. UA 0.027
  10. WN 0.009
  11. (200.0, 500.0] WN 0.194
  12. DL 0.189
  13. OO 0.159
  14. EV 0.156
  15. MQ 0.100
  16. AA 0.071
  17. UA 0.062
  18. VX 0.028
  19. Name: AIRLINE, dtype: float64

原理

  1. In[89]: flights.groupby(cuts)['AIRLINE'].value_counts(normalize=True)['AIRLINE'].value_counts(normalize=True).round(3).head(15)
  2. Out[89]:
  3. DIST AIRLINE
  4. (-inf, 200.0] OO 0.325625
  5. EV 0.289325
  6. MQ 0.210809
  7. DL 0.086045
  8. AA 0.052165
  9. UA 0.027427
  10. WN 0.008604
  11. (200.0, 500.0] WN 0.193902
  12. DL 0.188736
  13. OO 0.158687
  14. EV 0.156293
  15. MQ 0.100164
  16. AA 0.071375
  17. UA 0.062051
  18. VX 0.028222
  19. US 0.016001
  20. NK 0.011843
  21. B6 0.006867
  22. F9 0.004914
  23. AS 0.000945
  24. (500.0, 1000.0] DL 0.205625
  25. AA 0.143908
  26. WN 0.138196
  27. UA 0.131129
  28. OO 0.106443
  29. EV 0.100683
  30. MQ 0.051213
  31. F9 0.038192
  32. NK 0.029527
  33. US 0.025316
  34. AS 0.023234
  35. VX 0.003582
  36. B6 0.002953
  37. (1000.0, 2000.0] AA 0.263781
  38. UA 0.199070
  39. DL 0.165092
  40. WN 0.159664
  41. OO 0.046454
  42. NK 0.045115
  43. US 0.040462
  44. F9 0.030664
  45. AS 0.015931
  46. EV 0.015579
  47. VX 0.012125
  48. B6 0.003313
  49. MQ 0.002749
  50. (2000.0, inf] UA 0.289097
  51. AA 0.211643
  52. DL 0.171436
  53. B6 0.080414
  54. VX 0.073754
  55. US 0.065121
  56. WN 0.046374
  57. HA 0.027627
  58. NK 0.019240
  59. AS 0.011593
  60. F9 0.003700
  61. Name: AIRLINE, dtype: float64

更多

  1. # 求飞行时间的0.25,0.5,0.75分位数
  2. In[90]: flights.groupby(cuts)['AIR_TIME'].quantile(q=[.25, .5, .75]).div(60).round(2)
  3. Out[90]: DIST
  4. (-inf, 200.0] 0.25 0.43
  5. 0.50 0.50
  6. 0.75 0.57
  7. (200.0, 500.0] 0.25 0.77
  8. 0.50 0.92
  9. 0.75 1.05
  10. (500.0, 1000.0] 0.25 1.43
  11. 0.50 1.65
  12. 0.75 1.92
  13. (1000.0, 2000.0] 0.25 2.50
  14. 0.50 2.93
  15. 0.75 3.40
  16. (2000.0, inf] 0.25 4.30
  17. 0.50 4.70
  18. 0.75 5.03
  19. Name: AIR_TIME, dtype: float64
  1. # unstack方法可以将内层的索引变为列名
  2. In[91]: labels=['Under an Hour', '1 Hour', '1-2 Hours', '2-4 Hours', '4+ Hours']
  3. cuts2 = pd.cut(flights['DIST'], bins=bins, labels=labels)
  4. flights.groupby(cuts2)['AIRLINE'].value_counts(normalize=True).round(3).unstack().style.highlight_max(axis=1)
  5. Out[91]:

10. 用连续变量分组 - 图2