3. 在整个DataFrame上操作

  1. In[18]: pd.options.display.max_rows = 8
  2. movie = pd.read_csv('data/movie.csv')
  3. # 打印行数和列数
  4. movie.shape
  5. Out[18]: (4916, 28)
  1. # 打印元素的总个数(行数 × 列数)
  2. In[19]: movie.size
  3. Out[19]: 137648
  1. # 该数据集的维度
  2. In[20]: movie.ndim
  3. Out[20]: 2
  1. # 该数据集的长度(即行数)
  2. In[21]: len(movie)
  3. Out[21]: 4916
  1. # 各列非缺失值的个数
  2. In[22]: movie.count()
  3. Out[22]: color 4897
  4. director_name 4814
  5. num_critic_for_reviews 4867
  6. duration 4901
  7. ...
  8. actor_2_facebook_likes 4903
  9. imdb_score 4916
  10. aspect_ratio 4590
  11. movie_facebook_likes 4916
  12. Length: 28, dtype: int64
  1. # 各数值列的最小值(输出只含16个数值列,非数值列被忽略;pandas 2.0及以上需显式传入numeric_only=True)
  2. In[23]: movie.min()
  3. Out[23]: num_critic_for_reviews 1.00
  4. duration 7.00
  5. director_facebook_likes 0.00
  6. actor_3_facebook_likes 0.00
  7. ...
  8. actor_2_facebook_likes 0.00
  9. imdb_score 1.60
  10. aspect_ratio 1.18
  11. movie_facebook_likes 0.00
  12. Length: 16, dtype: float64
  1. # 打印描述性统计信息(默认只统计数值列)
  2. In[24]: movie.describe()
  3. Out[24]:

3. 在整个DataFrame上操作 - 图1

  1. # 使用percentiles参数指定分位数
  2. In[25]: pd.options.display.max_rows = 10
  3. In[26]: movie.describe(percentiles=[.01, .3, .99])
  4. Out[26]:

3. 在整个DataFrame上操作 - 图2

  1. # 统计各列缺失值的个数
  2. In[27]: pd.options.display.max_rows = 8
  3. In[28]: movie.isnull().sum()
  4. Out[28]: color 19
  5. director_name 102
  6. num_critic_for_reviews 49
  7. duration 15
  8. ...
  9. actor_2_facebook_likes 13
  10. imdb_score 0
  11. aspect_ratio 326
  12. movie_facebook_likes 0
  13. Length: 28, dtype: int64

更多

  1. # 设定skipna=False后,只有不含缺失值的数值列才会返回最小值,含缺失值的列返回NaN
  2. In[29]: movie.min(skipna=False)
  3. Out[29]: num_critic_for_reviews NaN
  4. duration NaN
  5. director_facebook_likes NaN
  6. actor_3_facebook_likes NaN
  7. ...
  8. actor_2_facebook_likes NaN
  9. imdb_score 1.6
  10. aspect_ratio NaN
  11. movie_facebook_likes 0.0
  12. Length: 16, dtype: float64