8. 减肥对赌
# 读取减肥数据集,查看一月的数据
In[63]: weight_loss = pd.read_csv('data/weight_loss.csv')
weight_loss.query('Month == "Jan"')
Out[63]:
# 定义一个求减肥比例的函数
# In[64]:
def find_perc_loss(s):
    """Return each value's fractional change relative to the first value.

    Parameters
    ----------
    s : pandas.Series
        Sequence of measurements; the first element (by position) is
        used as the baseline.

    Returns
    -------
    pandas.Series
        Same index as ``s``. The first entry is 0, and values below the
        baseline come out negative (e.g. -0.01 is a 1% loss).
    """
    # Positional lookup: the index labels of a filtered or grouped Series
    # need not start at 0, so .iloc is used instead of label-based access.
    baseline = s.iloc[0]
    return (s - baseline) / baseline
# 查看Bob在一月的减肥成果
In[65]: bob_jan = weight_loss.query('Name=="Bob" and Month=="Jan"')
find_perc_loss(bob_jan['Weight'])
Out[65]: 0 0.000000
2 -0.010309
4 -0.027491
6 -0.027491
Name: Weight, dtype: float64
# 对Name和Month进行分组,然后使用transform方法,传入函数,对数值进行转换
In[66]: pcnt_loss = weight_loss.groupby(['Name', 'Month'])['Weight'].transform(find_perc_loss)
pcnt_loss.head(8)
Out[66]: 0 0.000000
1 0.000000
2 -0.010309
3 -0.040609
4 -0.027491
5 -0.040609
6 -0.027491
7 -0.035533
Name: Weight, dtype: float64
# transform之后的结果,行数不变,可以赋值给原始DataFrame作为一个新列;
# 为了缩短输出,只选择Bob的前两个月数据
In[67]: weight_loss['Perc Weight Loss'] = pcnt_loss.round(3)
weight_loss.query('Name=="Bob" and Month in ["Jan", "Feb"]')
Out[67]:
# 因为最重要的是每个月的第4周,只选择第4周的数据
In[68]: week4 = weight_loss.query('Week == "Week 4"')
week4
Out[68]:
# 用pivot重构DataFrame,让Amy和Bob的数据并排放置
In[69]: winner = week4.pivot(index='Month', columns='Name', values='Perc Weight Loss')
winner
Out[69]:
# 用NumPy的np.where函数选出每月的赢家:减重比例数值更小(负得更多)的一方获胜
In[70]: winner['Winner'] = np.where(winner['Amy'] < winner['Bob'], 'Amy', 'Bob')
winner.style.highlight_min(axis=1)
Out[70]:
# 用value_counts()返回最后的比分
In[71]: winner.Winner.value_counts()
Out[71]: Amy 3
Bob 1
Name: Winner, dtype: int64
更多
# pivot之后,Month索引默认按字母顺序排序(Apr、Feb、Jan、Mar),而不是按时间顺序
In[72]: week4a = week4.copy()
month_chron = week4a['Month'].unique()
month_chron
Out[72]: array(['Jan', 'Feb', 'Mar', 'Apr'], dtype=object)
# 将Month转换为有序的Categorical类型(类别顺序取自month_chron),这样pivot之后就会按时间顺序排列
In[73]: week4a['Month'] = pd.Categorical(week4a['Month'],
categories=month_chron,
ordered=True)
week4a.pivot(index='Month', columns='Name', values='Perc Weight Loss')
Out[73]: