第11章用Matplotlib、Pandas、Seaborn进行可视化 - 4. 可视化flights数据集 - 《Pandas Cookbook 带注释源码》

4. 可视化flights数据集

#  读取flights数据集
 In[52]: flights = pd.read_csv('data/flights.csv')  # load the flights table from the local CSV file
         flights.head()  # preview the first rows
Out[52]:

#  创建两列，表示延迟和准时
 In[53]: flights['DELAYED'] = flights['ARR_DELAY'].ge(15).astype(int)
         cols = ['DIVERTED', 'CANCELLED', 'DELAYED']
         flights['ON_TIME'] = 1 - flights[cols].any(axis=1)
         cols.append('ON_TIME')
         status = flights[cols].sum()
         status
Out[53]: DIVERTED       137
         CANCELLED      881
         DELAYED      11685
         ON_TIME      45789
         dtype: int64

#  对类型值和连续值列作图
 In[54]: fig, ax_array = plt.subplots(2, 3, figsize=(18,8))
         (ax1, ax2, ax3), (ax4, ax5, ax6) = ax_array
         fig.suptitle('2015 US Flights - Univariate Summary', size=20)
         ac = flights['AIRLINE'].value_counts()
         ac.plot(kind='barh', ax=ax1, title ='Airline')
         oc = flights['ORG_AIR'].value_counts()
         oc.plot(kind='bar', ax=ax2, rot=0, title='Origin City')
         dc = flights['DEST_AIR'].value_counts().head(10)
         dc.plot(kind='bar', ax=ax3, rot=0, title='Destination City')
         status.plot(kind='bar', ax=ax4, rot=0, log=True, title='Flight Status')
         flights['DIST'].plot(kind='kde', ax=ax5, xlim=(0, 3000),
                              title='Distance KDE')
         flights['ARR_DELAY'].plot(kind='hist', ax=ax6, 
                                   title='Arrival Delay', range=(0,200))
Out[54]: <matplotlib.axes._subplots.AxesSubplot at 0x11a67e3c8>

#  添加关于年的列，用起飞时间得到小时和分钟
 In[55]: hour = flights['SCHED_DEP'] // 100
         minute = flights['SCHED_DEP'] % 100
         df_date = flights[['MONTH', 'DAY']].assign(YEAR=2015, HOUR=hour, MINUTE=minute)
         df_date.head()
Out[55]:

#  用to_datetime函数，将df_date变为Timestamps对象
 In[56]: flight_dep = pd.to_datetime(df_date)  # assemble one timestamp per row from the date-part columns of df_date
         flight_dep.head()
Out[56]: 0   2015-01-01 16:25:00
         1   2015-01-01 08:23:00
         2   2015-01-01 13:05:00
         3   2015-01-01 15:55:00
         4   2015-01-01 17:20:00
         dtype: datetime64[ns]

#  用flight_dep作为新的行索引，并根据它统计每周的航班数
 In[57]: flights.index = flight_dep  # index the rows by departure timestamp so they can be resampled
         fc = flights.resample('W').size()  # number of rows (flights) in each weekly bin
         fc.plot(figsize=(12,3), title='Flights per Week', grid=True)
Out[57]: <matplotlib.axes._subplots.AxesSubplot at 0x109d116d8>

#  如果航班数小于1000，则将其当做缺失值。然后用interpolate方法填补缺失值
 In[58]: fc_miss = fc.where(fc >  1000)
         fc_intp = fc_miss.interpolate(limit_direction='both')
         ax = fc_intp.plot(color='black', figsize=(16,4))
         fc_intp[fc < 500].plot(linewidth=10, grid=True, 
                                color='.8', ax=ax)
         ax.annotate(xy=(.8, .55), xytext=(.8, .77), 
                     xycoords='axes fraction', s='missing data', 
                     ha='center',  size=20, arrowprops=dict())
         ax.set_title('Flights per Week (Interpolated Missing Data)')
Out[58]: Text(0.5,1,'Flights per Week (Interpolated Missing Data)')

#  在到达航班数超过100的目的地机场中，找出平均入境航程最长的10个
 In[59]: flights.groupby('DEST_AIR')['DIST'] \
                .agg(['mean', 'count']) \
                .query('count > 100') \
                .sort_values('mean') \
                .tail(10) \
                .plot(kind='bar', y='mean', legend=False, 
                      rot=0, figsize=(14,4),
                      title='Average Distance per Destination')
Out[59]: <matplotlib.axes._subplots.AxesSubplot at 0x11a480dd8>

#  画出航班时间和航程的散点图（只保留DIST不超过2000且没有缺失值的航班）
 In[60]: fs = flights.reset_index(drop=True)[['DIST', 'AIR_TIME']].query('DIST <= 2000').dropna()
         fs.plot(x='DIST', y='AIR_TIME', kind='scatter', s=1, figsize=(16,4))
Out[60]: <matplotlib.axes._subplots.AxesSubplot at 0x11a49b860>

#  用cut函数，将航班距离分成八组
 In[61]: fs['DIST_GROUP'] = pd.cut(fs['DIST'], bins=range(0, 2001, 250))  # bin edges 0, 250, ..., 2000 -> eight intervals
         fs['DIST_GROUP'].value_counts().sort_index()  # rows per interval, listed in interval order
Out[61]: (0, 250]         6529
         (250, 500]      12631
         (500, 750]      11506
         (750, 1000]      8832
         (1000, 1250]     5071
         (1250, 1500]     3198
         (1500, 1750]     3885
         (1750, 2000]     1815
         Name: DIST_GROUP, dtype: int64

#  计算每组内飞行时间的标准分数（z-score）：减去组内均值，再除以组内标准差
 In[62]: normalize = lambda x: (x - x.mean()) / x.std()
         fs['TIME_SCORE'] = fs.groupby('DIST_GROUP')['AIR_TIME'] \
                              .transform(normalize)
         fs.head()
Out[62]:

#  用boxplot方法画出异常值
 In[63]: ax = fs.boxplot(by='DIST_GROUP', column='TIME_SCORE', figsize=(16,4))  # one box of TIME_SCORE per distance group
         ax.set_title('Z-Scores for Distance Groups')
         ax.figure.suptitle('')  # blank out the figure-level suptitle
/Users/Ted/anaconda/lib/python3.6/site-packages/numpy/core/fromnumeric.py:57: FutureWarning: reshape is deprecated and will raise in a subsequent release. Please use .values.reshape(...) instead
  return getattr(obj, method)(*args, **kwds)
Out[63]: Text(0.5,0.98,'')

#  检查超出6个标准偏差的点。用一个DataFrame记录异常点。
 In[64]: outliers = flights.iloc[fs[fs['TIME_SCORE'] > 6].index]
         outliers = outliers[['AIRLINE','ORG_AIR', 'DEST_AIR', 'AIR_TIME', 
                              'DIST', 'ARR_DELAY', 'DIVERTED']]
         outliers['PLOT_NUM'] = range(1, len(outliers) + 1)
         outliers
Out[64]:

#  可以用这张表的数据来确定异常值。pandas提供了将表格附加于图片底部的方法。
 In[65]: ax = fs.plot(x='DIST', y='AIR_TIME', 
                      kind='scatter', s=1, 
                      figsize=(16,4), table=outliers)
         outliers.plot(x='DIST', y='AIR_TIME',
                       kind='scatter', s=25, ax=ax, grid=True)
         outs = outliers[['AIR_TIME', 'DIST', 'PLOT_NUM']]
         for t, d, n  in outs.itertuples(index=False):
             ax.text(d + 5, t + 5, str(n))
         plt.setp(ax.get_xticklabels(), y=.1)
         plt.setp(ax.get_xticklines(), visible=False)
         ax.set_xlabel('')
         ax.set_title('Flight Time vs Distance with Outliers')
Out[65]: Text(0.5,1,'Flight Time vs Distance with Outliers')