10. 用where方法保留Series
# 读取movie数据集,movie_title作为行索引,actor_1_facebook_likes列删除缺失值
In[68]: movie = pd.read_csv('data/movie.csv', index_col='movie_title')
fb_likes = movie['actor_1_facebook_likes'].dropna()
fb_likes.head()
Out[68]: movie_title
Avatar 1000.0
Pirates of the Caribbean: At World's End 40000.0
Spectre 11000.0
The Dark Knight Rises 27000.0
Star Wars: Episode VII - The Force Awakens 131.0
Name: actor_1_facebook_likes, dtype: float64
# 使用describe获得对数据的认知
In[69]: fb_likes.describe(percentiles=[.1, .25, .5, .75, .9]).astype(int)
Out[69]: count 4909
mean 6494
std 15106
min 0
10% 240
25% 607
50% 982
75% 11000
90% 18000
max 640000
Name: actor_1_facebook_likes, dtype: int64
# 作用和前面相同(这里是作者代码弄乱了)
In[70]: fb_likes.describe(percentiles=[.1,.25,.5,.75,.9])
Out[70]: count 4909.000000
mean 6494.488491
std 15106.986884
min 0.000000
10% 240.000000
25% 607.000000
50% 982.000000
75% 11000.000000
90% 18000.000000
max 640000.000000
Name: actor_1_facebook_likes, dtype: float64
# 画一张直方图
In[71]: fb_likes.hist()
Out[71]: <matplotlib.axes._subplots.AxesSubplot at 0x10f9fbe80>
# 检测喜欢数小于20000的影片所占的比例
In[72]: criteria_high = fb_likes < 20000
criteria_high.mean().round(2)
Out[72]: 0.91000000000000003
# where方法会返回一个同样大小的Series,但条件为False的位置上的值会被替换成缺失值
In[73]: fb_likes.where(criteria_high).head()
Out[73]: movie_title
Avatar 1000.0
Pirates of the Caribbean: At World's End NaN
Spectre 11000.0
The Dark Knight Rises NaN
Star Wars: Episode VII - The Force Awakens 131.0
Name: actor_1_facebook_likes, dtype: float64
# 第二个参数other,可以让你控制替换值
In[74]: fb_likes.where(criteria_high, other=20000).head()
Out[74]: movie_title
Avatar 1000.0
Pirates of the Caribbean: At World's End 20000.0
Spectre 11000.0
The Dark Knight Rises 20000.0
Star Wars: Episode VII - The Force Awakens 131.0
Name: actor_1_facebook_likes, dtype: float64
# 通过where条件,设定上下限的值
In[75]: criteria_low = fb_likes > 300
fb_likes_cap = fb_likes.where(criteria_high, other=20000)\
.where(criteria_low, 300)
fb_likes_cap.head()
Out[75]: movie_title
Avatar 1000.0
Pirates of the Caribbean: At World's End 20000.0
Spectre 11000.0
The Dark Knight Rises 20000.0
Star Wars: Episode VII - The Force Awakens 300.0
Name: actor_1_facebook_likes, dtype: float64
# 原始Series和修改过的Series的长度是一样的
In[76]: len(fb_likes), len(fb_likes_cap)
Out[76]: (4909, 4909)
# 再做一张直方图,效果好多了
In[77]: fb_likes_cap.hist()
Out[77]: <matplotlib.axes._subplots.AxesSubplot at 0x10eeea8d0>
# clip方法可以直接设定上下限,结果与上面链式调用where得到的Series完全相同
In[78]: fb_likes_cap2 = fb_likes.clip(lower=300, upper=20000)
fb_likes_cap2.equals(fb_likes_cap)
Out[78]: True