12. 使用布尔值、整数、标签进行选取
# 读取movie,根据布尔条件选取
In[89]: movie = pd.read_csv('data/movie.csv', index_col='movie_title')
c1 = movie['content_rating'] == 'G'
c2 = movie['imdb_score'] < 4
criteria = c1 & c2
In[90]: movie_loc = movie.loc[criteria]
movie_loc.head()
Out[90]:
# 检查loc条件和布尔条件创建出来的两个DataFrame是否一样
In[91]: movie_loc.equals(movie[criteria])
Out[91]: True
# 尝试把布尔Series直接传给.iloc进行索引(会报错)
In[92]: movie_iloc = movie.iloc[criteria]
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
<ipython-input-92-24a12062c6c3> in <module>()
----> 1 movie_iloc = movie.iloc[criteria]
/Users/Ted/anaconda/lib/python3.6/site-packages/pandas/core/indexing.py in __getitem__(self, key)
1326 else:
1327 key = com._apply_if_callable(key, self.obj)
-> 1328 return self._getitem_axis(key, axis=0)
1329
1330 def _is_scalar_access(self, key):
/Users/Ted/anaconda/lib/python3.6/site-packages/pandas/core/indexing.py in _getitem_axis(self, key, axis)
1731
1732 if is_bool_indexer(key):
-> 1733 self._has_valid_type(key, axis)
1734 return self._getbool_axis(key, axis=axis)
1735
/Users/Ted/anaconda/lib/python3.6/site-packages/pandas/core/indexing.py in _has_valid_type(self, key, axis)
1588 "indexing on an integer type "
1589 "is not available")
-> 1590 raise ValueError("iLocation based boolean indexing cannot use "
1591 "an indexable as a mask")
1592 return True
ValueError: iLocation based boolean indexing cannot use an indexable as a mask
# 但是,却可以使用布尔值的ndarray,用values属性可以取出底层的array
In[93]: movie_iloc = movie.iloc[criteria.values]
In[94]: movie_iloc.equals(movie_loc)
Out[94]: True
In[95]: movie.loc[criteria.values]
Out[95]:
# 布尔索引也可以用来选取列
In[96]: criteria_col = movie.dtypes == np.int64
criteria_col.head()
Out[96]: color False
director_name False
num_critic_for_reviews False
duration False
director_facebook_likes False
dtype: bool
In[97]: movie.loc[:, criteria_col].head()
Out[97]:
# 因为criteria_col是一个带索引的Series(索引为列名),必须使用其底层的ndarray,才能用于.iloc
In[98]: movie.iloc[:, criteria_col.values].head()
Out[98]:
# 选取'content_rating', 'imdb_score', 'title_year', 'gross'四列,按照imdb_score升序排列
In[99]: cols = ['content_rating', 'imdb_score', 'title_year', 'gross']
movie.loc[criteria, cols].sort_values('imdb_score')
Out[99]:
# 用get_loc获取这四列的整数位置
In[100]: col_index = [movie.columns.get_loc(col) for col in cols]
col_index
Out[100]: [20, 24, 22, 8]
# 这时候就可以使用iloc了
In[101]: movie.iloc[criteria.values, col_index].sort_values('imdb_score')
Out[101]:
原理
# 查看Series的底层结构
In[102]: a = criteria.values
a[:5]
Out[102]: array([False, False, False, False, False], dtype=bool)
In[103]: len(a), len(criteria)
Out[103]: (4916, 4916)
更多
# 传入的布尔索引可以跟要操作的DataFrame长度不同
# (注意:这是旧版pandas的行为;较新的pandas版本在布尔索引长度不匹配时可能会抛出IndexError,请以所用版本为准)
In[104]: movie.loc[[True, False, True], [True, False, False, True]]
Out[104]: