3. 比较特朗普和奥巴马的支持率
# pandas的read_html函数可以从网页抓取表格数据
In[31]: base_url = 'http://www.presidency.ucsb.edu/data/popularity.php?pres={}'
trump_url = base_url.format(45)
df_list = pd.read_html(trump_url)
len(df_list)
Out[31]: 14
# 一共返回了14个表的DataFrame,取第一个
In[32]: df0 = df_list[0]
df0.shape
Out[32]: (324, 1906)
In[33]: df0.head(7)
Out[33]:
# 用match参数匹配table中的字符串
In[34]: df_list = pd.read_html(trump_url, match='Start Date')
len(df_list)
Out[34]: 3
# 通过检查页面元素的属性,用attrs参数进行匹配
In[35]: df_list = pd.read_html(trump_url, match='Start Date', attrs={'align':'center'})
len(df_list)
Out[35]: 1
# 查看DataFrame的形状
In[36]: trump = df_list[0]
trump.shape
Out[36]: (265, 19)
In[37]: trump.head(8)
Out[37]:
# skiprows可以指定跳过一些行,header参数可以指定列名,用parse_dates指定开始和结束日期
In[38]: df_list = pd.read_html(trump_url, match='Start Date', attrs={'align':'center'},
header=0, skiprows=[0,1,2,3,5], parse_dates=['Start Date', 'End Date'])
trump = df_list[0]
trump.head()
Out[38]:
# 删除所有值都是缺失值的列
In[39]: trump = trump.dropna(axis=1, how='all')
trump.head()
Out[39]:
# 统计各列的缺失值个数
In[40]: trump.isnull().sum()
Out[40]: President 258
Start Date 0
End Date 0
Approving 0
Disapproving 0
unsure/no data 0
dtype: int64
# 缺失值向前填充
In[41]: trump = trump.ffill()
trump.head()
Out[41]:
# 确认数据类型
In[42]: trump.dtypes
Out[42]: President object
Start Date datetime64[ns]
End Date datetime64[ns]
Approving int64
Disapproving int64
unsure/no data int64
dtype: object
# 将前面的步骤做成一个函数,用于获取任意总统的信息
In[43]: def get_pres_appr(pres_num):
base_url = 'http://www.presidency.ucsb.edu/data/popularity.php?pres={}'
pres_url = base_url.format(pres_num)
df_list = pd.read_html(pres_url, match='Start Date', attrs={'align':'center'},
header=0, skiprows=[0,1,2,3,5], parse_dates=['Start Date', 'End Date'])
pres = df_list[0].copy()
pres = pres.dropna(axis=1, how='all')
pres['President'] = pres['President'].ffill()
return pres.sort_values('End Date').reset_index(drop=True)
# 括号中的数字是总统的编号,奥巴马是44
In[44]: obama = get_pres_appr(44)
obama.head()
Out[44]:
# 获取最近五位总统的数据,输出每位的前三行数据
In[45]: pres_41_45 = pd.concat([get_pres_appr(x) for x in range(41,46)], ignore_index=True)
pres_41_45.groupby('President').head(3)
Out[45]:
# 确认一下是否有一个日期对应多个支持率
In[46]: pres_41_45['End Date'].value_counts().head(8)
Out[46]: 1990-03-11 2
1990-08-12 2
1990-08-26 2
2013-10-10 2
1999-02-09 2
1992-11-22 2
1990-05-22 2
2005-01-05 1
Name: End Date, dtype: int64
# 去除重复值
In[47]: pres_41_45 = pres_41_45.drop_duplicates(subset='End Date')
In[48]: pres_41_45.shape
Out[48]: (3695, 6)
# 对数据做简单的统计
In[49]: pres_41_45['President'].value_counts()
Out[49]: Barack Obama 2786
George W. Bush 270
Donald J. Trump 259
William J. Clinton 227
George Bush 153
Name: President, dtype: int64
In[50]: pres_41_45.groupby('President', sort=False).median().round(1)
Out[50]:
# 画出每任总统的支持率变化
In[51]: from matplotlib import cm
fig, ax = plt.subplots(figsize=(16,6))
styles = ['-.', '-', ':', '-', ':']
colors = [.9, .3, .7, .3, .9]
groups = pres_41_45.groupby('President', sort=False)
for style, color, (pres, df) in zip(styles, colors, groups):
df.plot('End Date', 'Approving', ax=ax, label=pres, style=style, color=cm.Greys(color),
title='Presedential Approval Rating')
# 上面的图是将数据前后串起来,也可以用支持率对在职天数作图
In[52]: days_func = lambda x: x - x.iloc[0]
pres_41_45['Days in Office'] = pres_41_45.groupby('President') \
['End Date'] \
.transform(days_func)
In[82]: pres_41_45['Days in Office'] = pres_41_45.groupby('President')['End Date'].transform(lambda x: x - x.iloc[0])
pres_41_45.groupby('President').head(3)
Out[82]:
# 查看数据类型
In[83]: pres_41_45.dtypes
Out[83]: President object
Start Date datetime64[ns]
End Date datetime64[ns]
Approving int64
Disapproving int64
unsure/no data int64
Days in Office timedelta64[ns]
dtype: object
# Days in Office的数据类型是timedelta64[ns],单位是纳秒,将其转换为整数
In[86]: pres_41_45['Days in Office'] = pres_41_45['Days in Office'].dt.days
pres_41_45['Days in Office'].head()
Out[86]: 0 0
1 32
2 35
3 43
4 46
Name: Days in Office, dtype: int64
# 转换数据,使每位总统的支持率各成一列
In[87]: pres_pivot = pres_41_45.pivot(index='Days in Office', columns='President', values='Approving')
pres_pivot.head()
Out[87]:
# 只画出特朗普和奥巴马的支持率
In[88]: plot_kwargs = dict(figsize=(16,6), color=cm.gray([.3, .7]), style=['-', '--'], title='Approval Rating')
pres_pivot.loc[:250, ['Donald J. Trump', 'Barack Obama']].ffill().plot(**plot_kwargs)
Out[88]: <matplotlib.axes._subplots.AxesSubplot at 0x1152254a8>
更多
# rolling average方法可以平滑曲线,在这个例子中,使用的是90天求平均,参数on指明了滚动窗口是从哪列计算的
In[89]: pres_rm = pres_41_45.groupby('President', sort=False) \
.rolling('90D', on='End Date')['Approving'] \
.mean()
pres_rm.head()
Out[89]: President End Date
George Bush 1989-01-26 51.000000
1989-02-27 55.500000
1989-03-02 57.666667
1989-03-10 58.750000
1989-03-13 58.200000
Name: Approving, dtype: float64
# 对数据的行和列做调整,然后作图
In[90]: styles = ['-.', '-', ':', '-', ':']
colors = [.9, .3, .7, .3, .9]
color = cm.Greys(colors)
title='90 Day Approval Rating Rolling Average'
plot_kwargs = dict(figsize=(16,6), style=styles, color = color, title=title)
correct_col_order = pres_41_45.President.unique()
pres_rm.unstack('President')[correct_col_order].plot(**plot_kwargs)
Out[90]: <matplotlib.axes._subplots.AxesSubplot at 0x1162d0780>