2. 用matplotlib做数据可视化
# 读取movie数据集,计算每年的预算中位数,再计算五年滚动均值以平滑数据
In[32]: movie = pd.read_csv('data/movie.csv')
# Median budget per release year, expressed in millions of dollars.
yearly_median = movie.groupby('title_year')['budget'].median()
med_budget = yearly_median / 1e6
# Five-year rolling mean; min_periods=1 lets the first years use a shorter window
# instead of producing NaN.
med_budget_roll = med_budget.rolling(window=5, min_periods=1).mean()
med_budget_roll.tail()
Out[32]: title_year
2012.0 20.893
2013.0 19.893
2014.0 19.100
2015.0 17.980
2016.0 17.780
Name: budget, dtype: float64
# 将数据变为NumPy数组
In[33]: years = med_budget_roll.index.values  # index labels (the years) as a NumPy array
years[-5:]  # display the last five entries
Out[33]: array([ 2012., 2013., 2014., 2015., 2016.])
In[34]: budget = med_budget_roll.values  # rolling-mean budgets as a NumPy array
budget[-5:]  # display the last five entries
Out[34]: array([ 20.893, 19.893, 19.1 , 17.98 , 17.78 ])
# plot方法可以用来画线图
In[35]: fig, ax = plt.subplots(figsize=(14,4), linewidth=5, edgecolor='.5')
ax.plot(years, budget, linestyle='--', linewidth=3, color='.2', label='All Movies')
text_kwargs=dict(fontsize=20, family='cursive')
ax.set_title('Median Movie Budget', **text_kwargs)
ax.set_ylabel('Millions of Dollars', **text_kwargs)
Out[35]: Text(0,0.5,'Millions of Dollars')
# 每年的电影产量
In[36]: movie_count = movie.groupby('title_year')['budget'].count()  # non-null budget entries per year
movie_count.tail()
Out[36]: title_year
2012.0 191
2013.0 208
2014.0 221
2015.0 192
2016.0 86
Name: budget, dtype: int64
# 在前图的基础上,将每年的电影产量画成柱状图;因为大部分电影都是近年的,所以将起始年份设为1970
In[37]: ct = movie_count.values
# Rescale the counts onto the budget axis: the largest count maps to budget.max().
ct_norm = ct / ct.max() * budget.max()
# Boolean mask selecting every fifth year from 1970 onwards.
fifth_year = (years % 5 == 0) & (years >= 1970)
years_5 = years[fifth_year]
ct_5 = ct[fifth_year]
ct_norm_5 = ct_norm[fifth_year]
# Bars are 3 years wide and semi-transparent so the budget line stays visible.
ax.bar(years_5, ct_norm_5, 3, facecolor='.5', alpha=.3, label='Movies per Year')
ax.set_xlim(1968, 2017)
# Write the real (unscaled) count just above the top of each bar.
for x, y, v in zip(years_5, ct_norm_5, ct_5):
    ax.text(x, y + .5, str(v), ha='center')
ax.legend()
# Last expression of the cell: re-display the updated figure.
fig
Out[37]:
# 计算每年预算最高的10部电影的预算中位数,再求其五年滚动均值
In[38]: top10 = (
    # Sort by budget first so the first ten rows of each year's group are its largest budgets.
    movie.sort_values('budget', ascending=False)
         .groupby('title_year')['budget']
         # Median of those ten budgets, in millions of dollars.
         .apply(lambda grp: grp.head(10).median() / 1e6)
)
# Smooth with a five-year rolling mean, as was done for the overall median.
top10_roll = top10.rolling(window=5, min_periods=1).mean()
top10_roll.tail()
Out[38]: title_year
2012.0 192.9
2013.0 195.9
2014.0 191.7
2015.0 186.8
2016.0 189.1
Name: budget, dtype: float64
# 将上面的数据画到另一张子图中
In[39]: fig2, ax_array = plt.subplots(2, 1, figsize=(14,6), sharex=True)
ax1 = ax_array[0]
ax2 = ax_array[1]
ax1.plot(years, budget, linestyle='--', linewidth=3, color='.2', label='All Movies')
ax1.bar(years_5, ct_norm_5, 3, facecolor='.5', alpha=.3, label='Movies per Year')
ax1.legend(loc='upper left')
ax1.set_xlim(1968, 2017)
plt.setp(ax1.get_xticklines(), visible=False)
for x, y, v in zip(years_5, ct_norm_5, ct_5):
ax1.text(x, y + .5, str(v), ha='center')
ax2.plot(years, top10_roll.values, color='.2', label='Top 10 Movies')
ax2.legend(loc='upper left')
fig2.tight_layout()
fig2.suptitle('Median Movie Budget', y=1.02, **text_kwargs)
fig2.text(0, .6, 'Millions of Dollars', rotation='vertical', ha='center', **text_kwargs)
import os
path = os.path.expanduser('~/Desktop/movie_budget.png')
fig2.savefig(path, bbox_inches='tight')
Out[39]:
原理
In[40]: med_budget_roll.tail()
Out[40]: title_year
2012.0 20.893
2013.0 19.893
2014.0 19.100
2015.0 17.980
2016.0 17.780
Name: budget, dtype: float64
# 手动确认一下rolling方法
In[41]: med_budget.loc[2012:2016].mean()
Out[41]: 17.78
In[42]: med_budget.loc[2011:2015].mean()
Out[42]: 17.98
In[43]: med_budget.loc[2010:2014].mean()
Out[43]: 19.1
# 必须使用expanduser创建完整路径
In[44]: os.path.expanduser('~/Desktop/movie_budget.png')
Out[44]: '/Users/Ted/Desktop/movie_budget.png'
更多
In[45]: cols = ['budget', 'title_year', 'imdb_score', 'movie_title']
# Keep only rows where all four columns are present.
m = movie[cols].dropna()
# Budget in millions of dollars; used below as the marker size.
m['budget2'] = m['budget'] / 1e6
# Seed NumPy's global RNG before sampling so the cell is repeatable.
np.random.seed(0)
movie_samp = m.query('title_year >= 2000').sample(100)
fig, ax = plt.subplots(figsize=(14,6))
# Column names are resolved against `data`; marker size comes from 'budget2'.
ax.scatter(x='title_year', y='imdb_score', s='budget2', data=movie_samp)
# Index labels of the lowest- and highest-scoring movies in the sample.
idx_min = movie_samp['imdb_score'].idxmin()
idx_max = movie_samp['imdb_score'].idxmax()
# Annotate both extremes; the label is shifted up for the minimum and down for
# the maximum.
for idx, offset in zip([idx_min, idx_max], [.5, -.5]):
    year = movie_samp.loc[idx, 'title_year']
    score = movie_samp.loc[idx, 'imdb_score']
    title = movie_samp.loc[idx, 'movie_title']
    # The label is passed positionally: the keyword was `s` in older Matplotlib
    # and became `text` in 3.3, so the positional form works on both.
    ax.annotate(title + ' ({})'.format(score),
                xy=(year, score),
                xytext=(year + 1, score + offset),
                ha='center',
                size=16,
                arrowprops=dict(arrowstyle="fancy"))
ax.set_title('IMDB Score by Year', size=25)
ax.grid(True)
Out[45]: