10. 用连续变量分组
In[83]: flights = pd.read_csv('data/flights.csv')
flights.head()
Out[83]:
# 判断DIST列有无缺失值
In[84]: flights.DIST.hasnans
Out[84]: False
# 再次删除DIST列的缺失值(原书是没有这两段的)
In[85]: flights.dropna(subset=['DIST']).shape
Out[85]: (58492, 14)
# 使用Pandas的cut函数,将数据分成5个面元
In[86]: bins = [-np.inf, 200, 500, 1000, 2000, np.inf]
cuts = pd.cut(flights['DIST'], bins=bins)
cuts.head()
Out[86]: 0 (500.0, 1000.0]
1 (1000.0, 2000.0]
2 (500.0, 1000.0]
3 (1000.0, 2000.0]
4 (1000.0, 2000.0]
Name: DIST, dtype: category
Categories (5, interval[float64]): [(-inf, 200.0] < (200.0, 500.0] < (500.0, 1000.0] < (1000.0, 2000.0] < (2000.0, inf]]
# 对每个面元进行统计
In[87]: cuts.value_counts()
Out[87]: (500.0, 1000.0] 20659
(200.0, 500.0] 15874
(1000.0, 2000.0] 14186
(2000.0, inf] 4054
(-inf, 200.0] 3719
Name: DIST, dtype: int64
# 面元Series可以用来进行分组
In[88]: flights.groupby(cuts)['AIRLINE'].value_counts(normalize=True).round(3).head(15)
Out[88]: DIST AIRLINE
(-inf, 200.0] OO 0.326
EV 0.289
MQ 0.211
DL 0.086
AA 0.052
UA 0.027
WN 0.009
(200.0, 500.0] WN 0.194
DL 0.189
OO 0.159
EV 0.156
MQ 0.100
AA 0.071
UA 0.062
VX 0.028
Name: AIRLINE, dtype: float64
原理
In[89]: flights.groupby(cuts)['AIRLINE'].value_counts(normalize=True)['AIRLINE'].value_counts(normalize=True).round(3).head(15)
Out[89]:
DIST AIRLINE
(-inf, 200.0] OO 0.325625
EV 0.289325
MQ 0.210809
DL 0.086045
AA 0.052165
UA 0.027427
WN 0.008604
(200.0, 500.0] WN 0.193902
DL 0.188736
OO 0.158687
EV 0.156293
MQ 0.100164
AA 0.071375
UA 0.062051
VX 0.028222
US 0.016001
NK 0.011843
B6 0.006867
F9 0.004914
AS 0.000945
(500.0, 1000.0] DL 0.205625
AA 0.143908
WN 0.138196
UA 0.131129
OO 0.106443
EV 0.100683
MQ 0.051213
F9 0.038192
NK 0.029527
US 0.025316
AS 0.023234
VX 0.003582
B6 0.002953
(1000.0, 2000.0] AA 0.263781
UA 0.199070
DL 0.165092
WN 0.159664
OO 0.046454
NK 0.045115
US 0.040462
F9 0.030664
AS 0.015931
EV 0.015579
VX 0.012125
B6 0.003313
MQ 0.002749
(2000.0, inf] UA 0.289097
AA 0.211643
DL 0.171436
B6 0.080414
VX 0.073754
US 0.065121
WN 0.046374
HA 0.027627
NK 0.019240
AS 0.011593
F9 0.003700
Name: AIRLINE, dtype: float64
更多
# 求飞行时间的0.25,0.5,0.75分位数
In[90]: flights.groupby(cuts)['AIR_TIME'].quantile(q=[.25, .5, .75]).div(60).round(2)
Out[90]: DIST
(-inf, 200.0] 0.25 0.43
0.50 0.50
0.75 0.57
(200.0, 500.0] 0.25 0.77
0.50 0.92
0.75 1.05
(500.0, 1000.0] 0.25 1.43
0.50 1.65
0.75 1.92
(1000.0, 2000.0] 0.25 2.50
0.50 2.93
0.75 3.40
(2000.0, inf] 0.25 4.30
0.50 4.70
0.75 5.03
Name: AIR_TIME, dtype: float64
# unstack方法可以将内层的索引变为列名
In[91]: labels=['Under an Hour', '1 Hour', '1-2 Hours', '2-4 Hours', '4+ Hours']
cuts2 = pd.cut(flights['DIST'], bins=bins, labels=labels)
flights.groupby(cuts2)['AIRLINE'].value_counts(normalize=True).round(3).unstack().style.highlight_max(axis=1)
Out[91]: