4. 可视化flights数据集
# Load the flights dataset from a CSV file and preview its first rows
In[52]: flights = pd.read_csv('data/flights.csv')
flights.head()
Out[52]:
# Create two indicator columns, DELAYED and ON_TIME.
# DELAYED is 1 where ARR_DELAY >= 15 (a missing ARR_DELAY compares False, so it yields 0).
In[53]: flights['DELAYED'] = flights['ARR_DELAY'].ge(15).astype(int)
cols = ['DIVERTED', 'CANCELLED', 'DELAYED']
# ON_TIME is 1 only for rows where none of the three status columns is set
flights['ON_TIME'] = 1 - flights[cols].any(axis=1)
cols.append('ON_TIME')
# Summing each column counts the flights per status
# (presumes DIVERTED and CANCELLED are 0/1 flags -- confirm against the data)
status = flights[cols].sum()
status
Out[53]: DIVERTED 137
CANCELLED 881
DELAYED 11685
ON_TIME 45789
dtype: int64
# Plot the categorical and continuous columns on a 2x3 grid of axes
In[54]: fig, ax_array = plt.subplots(2, 3, figsize=(18,8))
(ax1, ax2, ax3), (ax4, ax5, ax6) = ax_array
fig.suptitle('2015 US Flights - Univariate Summary', size=20)
# Top row: frequency counts of airline, origin airport and the 10 most common destinations
ac = flights['AIRLINE'].value_counts()
ac.plot(kind='barh', ax=ax1, title ='Airline')
oc = flights['ORG_AIR'].value_counts()
oc.plot(kind='bar', ax=ax2, rot=0, title='Origin City')
dc = flights['DEST_AIR'].value_counts().head(10)
dc.plot(kind='bar', ax=ax3, rot=0, title='Destination City')
# Bottom row: status counts on a log scale, a KDE of distance, and a histogram of arrival delay
# restricted to the range 0-200
status.plot(kind='bar', ax=ax4, rot=0, log=True, title='Flight Status')
flights['DIST'].plot(kind='kde', ax=ax5, xlim=(0, 3000),
title='Distance KDE')
flights['ARR_DELAY'].plot(kind='hist', ax=ax6,
title='Arrival Delay', range=(0,200))
Out[54]: <matplotlib.axes._subplots.AxesSubplot at 0x11a67e3c8>
# Add a constant YEAR column and derive HOUR and MINUTE from the scheduled departure time.
# NOTE(review): the // 100 and % 100 split presumes SCHED_DEP is an integer in HHMM form -- confirm.
In[55]: hour = flights['SCHED_DEP'] // 100
minute = flights['SCHED_DEP'] % 100
df_date = flights[['MONTH', 'DAY']].assign(YEAR=2015, HOUR=hour, MINUTE=minute)
df_date.head()
Out[55]:
# Use to_datetime to assemble the date-part columns of df_date into a Series of Timestamps
In[56]: flight_dep = pd.to_datetime(df_date)
flight_dep.head()
Out[56]: 0 2015-01-01 16:25:00
1 2015-01-01 08:23:00
2 2015-01-01 13:05:00
3 2015-01-01 15:55:00
4 2015-01-01 17:20:00
dtype: datetime64[ns]
# Use flight_dep as the new row index (this replaces the index of flights in place),
# then count the number of flights in each weekly bin
In[57]: flights.index = flight_dep
fc = flights.resample('W').size()
fc.plot(figsize=(12,3), title='Flights per Week', grid=True)
Out[57]: <matplotlib.axes._subplots.AxesSubplot at 0x109d116d8>
# Treat weeks with 1000 or fewer flights as missing, then fill the gaps with interpolate
# (limit_direction='both' also fills gaps at the start and end of the series)
In[58]: fc_miss = fc.where(fc > 1000)
fc_intp = fc_miss.interpolate(limit_direction='both')
ax = fc_intp.plot(color='black', figsize=(16,4))
# Overlay a thick light-grey line on the interpolated values of the weeks
# whose original count was below 500
fc_intp[fc < 500].plot(linewidth=10, grid=True,
                       color='.8', ax=ax)
# The label is passed positionally: its keyword was renamed from `s` to `text`
# in newer matplotlib releases, and the positional form works with both.
ax.annotate('missing data', xy=(.8, .55), xytext=(.8, .77),
            xycoords='axes fraction',
            ha='center', size=20, arrowprops=dict())
ax.set_title('Flights per Week (Interpolated Missing Data)')
Out[58]: Text(0.5,1,'Flights per Week (Interpolated Missing Data)')
# Find the 10 destination airports with the longest mean inbound flight distance,
# considering only airports with more than 100 flights
In[59]: (
    flights.groupby('DEST_AIR')['DIST']
    .agg(['mean', 'count'])
    .query('count > 100')
    .sort_values('mean')
    .tail(10)
    .plot(
        kind='bar',
        y='mean',
        legend=False,
        rot=0,
        figsize=(14,4),
        title='Average Distance per Destination',
    )
)
Out[59]: <matplotlib.axes._subplots.AxesSubplot at 0x11a480dd8>
# Scatter plot of air time against distance.
# The index is reset to 0..n-1 first, then only flights with DIST <= 2000 and
# no missing DIST/AIR_TIME values are kept.
In[60]: fs = flights.reset_index(drop=True)[['DIST', 'AIR_TIME']].query('DIST <= 2000').dropna()
fs.plot(x='DIST', y='AIR_TIME', kind='scatter', s=1, figsize=(16,4))
Out[60]: <matplotlib.axes._subplots.AxesSubplot at 0x11a49b860>
# Use cut to bin the flight distance into eight groups, each 250 wide, from 0 to 2000
In[61]: fs['DIST_GROUP'] = pd.cut(fs['DIST'], bins=range(0, 2001, 250))
fs['DIST_GROUP'].value_counts().sort_index()
Out[61]: (0, 250] 6529
(250, 500] 12631
(500, 750] 11506
(750, 1000] 8832
(1000, 1250] 5071
(1250, 1500] 3198
(1500, 1750] 3885
(1750, 2000] 1815
Name: DIST_GROUP, dtype: int64
# 计算每组的标准差
In[62]: normalize = lambda x: (x - x.mean()) / x.std()
fs['TIME_SCORE'] = fs.groupby('DIST_GROUP')['AIR_TIME'] \
.transform(normalize)
fs.head()
Out[62]:
# Use boxplot to visualize the outliers: one box of TIME_SCORE per distance group
In[63]: ax = fs.boxplot(by='DIST_GROUP', column='TIME_SCORE', figsize=(16,4))
ax.set_title('Z-Scores for Distance Groups')
# Blank out the figure-level super-title so that only the axes title remains
ax.figure.suptitle('')
/Users/Ted/anaconda/lib/python3.6/site-packages/numpy/core/fromnumeric.py:57: FutureWarning: reshape is deprecated and will raise in a subsequent release. Please use .values.reshape(...) instead
return getattr(obj, method)(*args, **kwds)
Out[63]: Text(0.5,0.98,'')
# Inspect the points more than 6 standard deviations above their group mean
# and collect them in a DataFrame.
# fs was built from flights.reset_index(drop=True), so its index labels are row
# positions in flights; that is why they can be passed to .iloc here.
In[64]: outliers = flights.iloc[fs[fs['TIME_SCORE'] > 6].index]
outliers = outliers[['AIRLINE','ORG_AIR', 'DEST_AIR', 'AIR_TIME',
'DIST', 'ARR_DELAY', 'DIVERTED']]
# Number the outliers 1..n so each one can be labelled on a plot
outliers['PLOT_NUM'] = range(1, len(outliers) + 1)
outliers
Out[64]:
# The data in this table can be used to identify the outliers on the plot.
# pandas can attach a table to a plot through the `table` argument.
In[65]: ax = fs.plot(x='DIST', y='AIR_TIME',
kind='scatter', s=1,
figsize=(16,4), table=outliers)
# Draw the outliers again with larger markers on the same axes
outliers.plot(x='DIST', y='AIR_TIME',
kind='scatter', s=25, ax=ax, grid=True)
outs = outliers[['AIR_TIME', 'DIST', 'PLOT_NUM']]
# Label every outlier with its PLOT_NUM, offset slightly from the marker
for t, d, n in outs.itertuples(index=False):
ax.text(d + 5, t + 5, str(n))
# Adjust the x tick labels' position, hide the tick marks and clear the x label
# (presumably to keep them from colliding with the attached table -- confirm visually)
plt.setp(ax.get_xticklabels(), y=.1)
plt.setp(ax.get_xticklines(), visible=False)
ax.set_xlabel('')
ax.set_title('Flight Time vs Distance with Outliers')
Out[65]: Text(0.5,1,'Flight Time vs Distance with Outliers')