5. 用唯一和有序索引选取

  1. # 读取college数据集,使用STABBR作为行索引,检查行索引是否有序(注:is_monotonic在pandas 1.5中已弃用、2.0中已移除,新版本请改用is_monotonic_increasing)
  2. In[27]: college = pd.read_csv('data/college.csv')
  3. college2 = college.set_index('STABBR')
  4. In[28]: college2.index.is_monotonic
  5. Out[28]: False
  1. # 将college2排序,存储成另一个对象,查看其是否有序
  2. In[29]: college3 = college2.sort_index()
  3. college3.index.is_monotonic
  4. Out[29]: True
  1. # 分别用布尔索引(college)、无序行索引(college2)和有序行索引(college3)选取得克萨斯州(TX)的数据,比较速度
  2. In[30]: %timeit college[college['STABBR'] == 'TX']
  3. 1.58 ms ± 63.8 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)
  4. In[31]: %timeit college2.loc['TX']
  5. 622 µs ± 18.1 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)
  6. In[32]: %timeit college3.loc['TX']
  7. 198 µs ± 5.8 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)
  1. # 使用INSTNM作为行索引,检测行索引是否唯一
  2. In[33]: college_unique = college.set_index('INSTNM')
  3. college_unique.index.is_unique
  4. Out[33]: True
  1. # 用布尔索引选取斯坦福大学
  2. In[34]: college[college['INSTNM'] == 'Stanford University']
  3. Out[34]:

5. 用唯一和有序索引选取 - 图1

  1. # 用行索引标签选取斯坦福大学
  2. In[35]: college_unique.loc['Stanford University']
  3. Out[35]:
  4. CITY Stanford
  5. STABBR CA
  6. HBCU 0
  7. MENONLY 0
  8. WOMENONLY 0
  9. RELAFFIL 0
  10. SATVRMID 730
  11. SATMTMID 745
  12. DISTANCEONLY 0
  13. UGDS 7018
  14. UGDS_WHITE 0.3752
  15. UGDS_BLACK 0.0591
  16. UGDS_HISP 0.1607
  17. UGDS_ASIAN 0.1979
  18. UGDS_AIAN 0.0114
  19. UGDS_NHPI 0.0038
  20. UGDS_2MOR 0.1067
  21. UGDS_NRA 0.0819
  22. UGDS_UNKN 0.0031
  23. PPTUG_EF 0
  24. CURROPER 1
  25. PCTPELL 0.1556
  26. PCTFLOAN 0.1256
  27. UG25ABV 0.0401
  28. MD_EARN_WNE_P10 86000
  29. GRAD_DEBT_MDN_SUPP 12782
  30. Name: Stanford University, dtype: object
  1. # 比较两种方法的速度
  2. In[36]: %timeit college[college['INSTNM'] == 'Stanford University']
  3. 1.44 ms ± 66 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)
  4. In[37]: %timeit college_unique.loc['Stanford University']
  5. 191 µs ± 5.31 µs per loop (mean ± std. dev. of 7 runs, 10000 loops each)

更多

  1. # 将CITY和STABBR两列拼接成一个字符串(如'Miami, FL')作为行索引,并按行索引排序
  2. In[38]: college.index = college['CITY'] + ', ' + college['STABBR']
  3. college = college.sort_index()
  4. college.head()
  5. Out[38]:

5. 用唯一和有序索引选取 - 图2

  1. # 选取所有位于Miami, FL的大学(仅显示前5行)
  2. In[39]: college.loc['Miami, FL'].head()
  3. Out[39]:

5. 用唯一和有序索引选取 - 图3

  1. # 速度比较
  2. In[40]: %%timeit
  3. crit1 = college['CITY'] == 'Miami'
  4. crit2 = college['STABBR'] == 'FL'
  5. college[crit1 & crit2]
  6. 2.83 ms ± 82.4 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
  7. In[41]: %timeit college.loc['Miami, FL']
  8. 226 µs ± 17.3 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)
  1. # 判断这两种选取方法得到的结果是否相同
  2. In[42]: college[(college['CITY'] == 'Miami') & (college['STABBR'] == 'FL')].equals(college.loc['Miami, FL'])
  3. Out[42]: True