5. 用唯一和有序索引选取
# 读取college数据集,使用STABBR作为行索引,检查行索引是否有序(注意:is_monotonic在pandas 1.5中已弃用、2.0中已移除,新版本请改用is_monotonic_increasing)
In[27]: college = pd.read_csv('data/college.csv')
college2 = college.set_index('STABBR')
In[28]: college2.index.is_monotonic
Out[28]: False
# 将college2按行索引排序,存储成另一个对象,查看其行索引是否有序
In[29]: college3 = college2.sort_index()
college3.index.is_monotonic
Out[29]: True
# 分别从这三个DataFrame中选取得克萨斯州(TX)的数据,比较速度
In[30]: %timeit college[college['STABBR'] == 'TX']
1.58 ms ± 63.8 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)
In[31]: %timeit college2.loc['TX']
622 µs ± 18.1 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)
In[32]: %timeit college3.loc['TX']
198 µs ± 5.8 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)
# 使用INSTNM作为行索引,检测行索引是否唯一
In[33]: college_unique = college.set_index('INSTNM')
college_unique.index.is_unique
Out[33]: True
# 用布尔索引选取斯坦福大学
In[34]: college[college['INSTNM'] == 'Stanford University']
Out[34]:
# 用行索引标签选取斯坦福大学
In[35]: college_unique.loc['Stanford University']
Out[35]:
CITY Stanford
STABBR CA
HBCU 0
MENONLY 0
WOMENONLY 0
RELAFFIL 0
SATVRMID 730
SATMTMID 745
DISTANCEONLY 0
UGDS 7018
UGDS_WHITE 0.3752
UGDS_BLACK 0.0591
UGDS_HISP 0.1607
UGDS_ASIAN 0.1979
UGDS_AIAN 0.0114
UGDS_NHPI 0.0038
UGDS_2MOR 0.1067
UGDS_NRA 0.0819
UGDS_UNKN 0.0031
PPTUG_EF 0
CURROPER 1
PCTPELL 0.1556
PCTFLOAN 0.1256
UG25ABV 0.0401
MD_EARN_WNE_P10 86000
GRAD_DEBT_MDN_SUPP 12782
Name: Stanford University, dtype: object
# 比较两种方法的速度
In[36]: %timeit college[college['INSTNM'] == 'Stanford University']
1.44 ms ± 66 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)
In[37]: %timeit college_unique.loc['Stanford University']
191 µs ± 5.31 µs per loop (mean ± std. dev. of 7 runs, 10000 loops each)
更多
# 将CITY和STABBR两列拼接成一个字符串(如'Miami, FL')作为行索引,并按行索引排序
In[38]: college.index = college['CITY'] + ', ' + college['STABBR']
college = college.sort_index()
college.head()
Out[38]:
# 选取所有Miami, FL的大学
In[39]: college.loc['Miami, FL'].head()
Out[39]:
# 比较布尔索引与行索引标签这两种选取方法的速度
In[40]: %%timeit
crit1 = college['CITY'] == 'Miami'
crit2 = college['STABBR'] == 'FL'
college[crit1 & crit2]
2.83 ms ± 82.4 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
In[41]: %timeit college.loc['Miami, FL']
226 µs ± 17.3 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)
# 判断这两种方法选取的结果是否相同
In[42]: college[(college['CITY'] == 'Miami') & (college['STABBR'] == 'FL')].equals(college.loc['Miami, FL'])
Out[42]: True