11. 对DataFrame的行做mask
# 读取movie数据集,并构造筛选条件:title_year大于等于2010,或者title_year为缺失值
In[79]: movie = pd.read_csv('data/movie.csv', index_col='movie_title')
c1 = movie['title_year'] >= 2010
c2 = movie['title_year'].isnull()
criteria = c1 | c2
# 使用mask方法,将所有满足条件的行的值替换为缺失值(NaN),这些行本身并不会被删除
In[80]: movie.mask(criteria).head()
Out[80]:
# 用dropna(how='all')删除所有值都是缺失值的行
In[81]: movie_mask = movie.mask(criteria).dropna(how='all')
movie_mask.head()
Out[81]:
# 用布尔索引选取title_year小于2010的电影
In[82]: movie_boolean = movie[movie['title_year'] < 2010]
movie_boolean.head()
Out[82]:
# 判断这两种方法得到的结果是否相同
In[83]: movie_mask.equals(movie_boolean)
Out[83]: False
# 判断二者的形状是否相同
In[84]: movie_mask.shape == movie_boolean.shape
Out[84]: True
# mask方法产生了许多缺失值(NaN),而NaN是浮点类型,所以原本是整数类型的列都被转换成了浮点型,这正是equals返回False的原因;下面逐列比较二者的数据类型
In[85]: movie_mask.dtypes == movie_boolean.dtypes
Out[85]:
color True
director_name True
num_critic_for_reviews True
duration True
director_facebook_likes True
actor_3_facebook_likes True
actor_2_name True
actor_1_facebook_likes True
gross True
genres True
actor_1_name True
num_voted_users False
cast_total_facebook_likes False
actor_3_name True
facenumber_in_poster True
plot_keywords True
movie_imdb_link True
num_user_for_reviews True
language True
country True
content_rating True
budget True
title_year True
actor_2_facebook_likes True
imdb_score True
aspect_ratio True
movie_facebook_likes False
dtype: bool
# pandas.testing模块提供了assert_frame_equal函数,用于判断两个DataFrame是否相等;传入check_dtype=False可以忽略数据类型的差异。两者相等时不返回任何内容,不相等时会抛出AssertionError
In[86]: from pandas.testing import assert_frame_equal
assert_frame_equal(movie_boolean, movie_mask, check_dtype=False)
更多
# 比较mask和布尔索引的速度,布尔索引大约快一个数量级(约1.12 ms对11.1 ms)
In[87]: %timeit movie.mask(criteria).dropna(how='all')
11.1 ms ± 48.3 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
In[88]: %timeit movie[movie['title_year'] < 2010]
1.12 ms ± 36.7 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)