11. 计算城市之间的航班总数
In[92]: flights = pd.read_csv('data/flights.csv')
flights.head()
Out[92]:
# 按出发地和目的地分组,求每个方向(出发地→目的地)上的航班数
In[93]: flights_ct = flights.groupby(['ORG_AIR', 'DEST_AIR']).size()
flights_ct.head()
Out[93]: ORG_AIR DEST_AIR
ATL ABE 31
ABQ 16
ABY 19
ACY 6
AEX 40
dtype: int64
# 分别选出休斯顿(IAH)和亚特兰大(ATL)之间两个方向各自的航班数
In[94]: flights_ct.loc[[('ATL', 'IAH'), ('IAH', 'ATL')]]
Out[94]: ORG_AIR DEST_AIR
ATL IAH 121
IAH ATL 148
dtype: int64
# 对每行的出发地和目的地按字母顺序排序,使同一对城市不论方向都得到相同的组合
In[95]: flights_sort = flights[['ORG_AIR', 'DEST_AIR']].apply(sorted, axis=1)
flights_sort.head()
Out[95]:
# 因为现在每行都是独立排序的,原列名(出发地/目的地)已不再准确。对列重命名,然后再计算每对城市间的航班总数
In[96]: rename_dict = {'ORG_AIR':'AIR1','DEST_AIR':'AIR2'}
flights_sort = flights_sort.rename(columns=rename_dict)
flights_ct2 = flights_sort.groupby(['AIR1', 'AIR2']).size()
flights_ct2.head()
Out[96]: AIR1 AIR2
ABE ATL 31
ORD 24
ABI DFW 74
ABQ ATL 16
DEN 46
dtype: int64
# 找到亚特兰大和休斯顿之间的航班总数(两个方向之和:121 + 148 = 269)
In[97]: flights_ct2.loc[('ATL', 'IAH')]
Out[97]: 269
# 如果调换顺序,则会出错:每对城市已按字母顺序排序,索引中只有('ATL', 'IAH'),没有('IAH', 'ATL')
In[98]: flights_ct2.loc[('IAH', 'ATL')]
---------------------------------------------------------------------------
IndexingError Traceback (most recent call last)
<ipython-input-98-56147a7d0bb5> in <module>()
----> 1 flights_ct2.loc[('IAH', 'ATL')]
/Users/Ted/anaconda/lib/python3.6/site-packages/pandas/core/indexing.py in __getitem__(self, key)
1323 except (KeyError, IndexError):
1324 pass
-> 1325 return self._getitem_tuple(key)
1326 else:
1327 key = com._apply_if_callable(key, self.obj)
/Users/Ted/anaconda/lib/python3.6/site-packages/pandas/core/indexing.py in _getitem_tuple(self, tup)
839
840 # no multi-index, so validate all of the indexers
--> 841 self._has_valid_tuple(tup)
842
843 # ugly hack for GH # 836
/Users/Ted/anaconda/lib/python3.6/site-packages/pandas/core/indexing.py in _has_valid_tuple(self, key)
186 for i, k in enumerate(key):
187 if i >= self.obj.ndim:
--> 188 raise IndexingError('Too many indexers')
189 if not self._has_valid_type(k, i):
190 raise ValueError("Location based indexing can only have [%s] "
IndexingError: Too many indexers
更多
# 用NumPy的sort函数可以大大提高速度
In[99]: data_sorted = np.sort(flights[['ORG_AIR', 'DEST_AIR']])
data_sorted[:10]
Out[99]: array([['LAX', 'SLC'],
['DEN', 'IAD'],
['DFW', 'VPS'],
['DCA', 'DFW'],
['LAX', 'MCI'],
['IAH', 'SAN'],
['DFW', 'MSY'],
['PHX', 'SFO'],
['ORD', 'STL'],
['IAH', 'SJC']], dtype=object)
# 重新用DataFrame构造器创建一个DataFrame,检测其是否与flights_sort相等
In[100]: flights_sort2 = pd.DataFrame(data_sorted, columns=['AIR1', 'AIR2'])
# Wrap the NumPy result above in a DataFrame with the AIR1/AIR2 labels, then
# compare it with the apply-based result.
# flights_sort was already rebound with AIR1/AIR2 columns in In[96], so this
# rename finds no ORG_AIR/DEST_AIR labels and leaves the columns unchanged.
fs_orig = flights_sort.rename(columns={'ORG_AIR':'AIR1', 'DEST_AIR':'AIR2'})
flights_sort2.equals(fs_orig)
Out[100]: True
# 比较速度
In[101]: %timeit flights_sort = flights[['ORG_AIR', 'DEST_AIR']].apply(sorted, axis=1)
7.82 s ± 189 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
In[102]: %%timeit
data_sorted = np.sort(flights[['ORG_AIR', 'DEST_AIR']])
flights_sort2 = pd.DataFrame(data_sorted, columns=['AIR1', 'AIR2'])
10.9 ms ± 325 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)