8. 用Seaborn破解diamonds数据集的辛普森悖论
# Overall exam results: student B has the higher percentage correct (80 vs 50).
In[95]: pd.DataFrame(
    {'Raw Score': ['50/100', '80/100'],
     'Percent Correct': [50, 80]},
    index=['Student A', 'Student B'],
    columns=['Raw Score', 'Percent Correct'])
Out[95]:
# The same results split by question difficulty: student A has the higher
# percentage on both the difficult (47 vs 40) and the easy (100 vs 82)
# questions, yet the lower total (50 vs 80) -- Simpson's paradox.
In[96]: pd.DataFrame(
    {'Difficult': ['45/95', '2/5'],
     'Easy': ['5/5', '78/95'],
     'Difficult Percent': [47, 40],
     'Easy Percent': [100, 82],
     'Total Percent': [50, 80]},
    index=['Student A', 'Student B'],
    columns=['Difficult', 'Easy', 'Difficult Percent',
             'Easy Percent', 'Total Percent'])
Out[96]:
# Read the diamonds dataset and preview its first five rows
In[97]: diamonds = pd.read_csv('data/diamonds.csv')
diamonds.head(n=5)
Out[97]:
# Convert the cut, color and clarity columns to ordered categoricals.
# The position of a level in its list defines its rank in the ordering.
In[98]: cut_cats = ['Fair', 'Good', 'Very Good', 'Premium', 'Ideal']
color_cats = ['J', 'I', 'H', 'G', 'F', 'E', 'D']
clarity_cats = ['I1', 'SI2', 'SI1', 'VS2', 'VS1', 'VVS2', 'VVS1', 'IF']
for column, levels in (('cut', cut_cats),
                       ('color', color_cats),
                       ('clarity', clarity_cats)):
    diamonds[column] = pd.Categorical(diamonds[column],
                                      categories=levels,
                                      ordered=True)
In[99]: fig, (ax1, ax2, ax3) = plt.subplots(1, 3, figsize=(14,4))
sns.barplot(x='color', y='price', data=diamonds, ax=ax1)
sns.barplot(x='cut', y='price', data=diamonds, ax=ax2)
sns.barplot(x='clarity', y='price', data=diamonds, ax=ax3)
fig.suptitle('Price Decreasing with Increasing Quality?')
Out[99]: Text(0.5,0.98,'Price Decreasing with Increasing Quality?')
# Plot price against diamond color, one facet per clarity level.
# catplot is the current name of factorplot (renamed in seaborn 0.9; the old
# name is no longer available in recent releases). kind='bar' is passed
# explicitly, so the resulting figure is the same.
In[100]: sns.catplot(x='color', y='price', col='clarity',
col_wrap=4, data=diamonds, kind='bar')
Out[100]: <seaborn.axisgrid.FacetGrid at 0x11b61d5f8>
# Same faceted bar plot, with carat in place of price on the y-axis.
# catplot is the current name of factorplot (renamed in seaborn 0.9; the old
# name is no longer available in recent releases). kind='bar' is passed
# explicitly, so the resulting figure is the same.
In[101]: sns.catplot(x='color', y='carat', col='clarity',
col_wrap=4, data=diamonds, kind='bar')
Out[101]: <seaborn.axisgrid.FacetGrid at 0x11e42eef0>
In[102]: fig, (ax1, ax2, ax3) = plt.subplots(1, 3, figsize=(14,4))
sns.barplot(x='color', y='carat', data=diamonds, ax=ax1)
sns.barplot(x='cut', y='carat', data=diamonds, ax=ax2)
sns.barplot(x='clarity', y='carat', data=diamonds, ax=ax3)
fig.suptitle('Diamond size decreases with quality')
Out[102]: Text(0.5,0.98,'Diamond size decreases with quality')
# The plot below shows that larger diamonds cost more: price against clarity,
# one line per carat bin, one facet per color.
# Bin carat into five quantile-based categories (adds a column to diamonds).
In[103]: diamonds['carat_category'] = pd.qcut(diamonds.carat, 5)
from matplotlib.cm import Greys
# Five grey shades sampled from the Greys colormap. They are not applied by
# default; pass palette=greys to the call below to use them.
greys = Greys(np.arange(50,250,40))
# catplot is the current name of factorplot (renamed in seaborn 0.9; the old
# name is no longer available in recent releases). kind='point' is explicit,
# so the resulting figure is the same.
g = sns.catplot(x='clarity', y='price', data=diamonds,
hue='carat_category', col='color',
col_wrap=4, kind='point') # optional: palette=greys
g.fig.suptitle('Diamond price by size, color and clarity',
y=1.02, size=20)
Out[103]: Text(0.5,1.02,'Diamond price by size, color and clarity')
更多
# Use seaborn's more advanced PairGrid constructor to plot the bivariate
# relationships: price against color, cut and clarity.
# The facet-size keyword is `height`; it was called `size` before seaborn 0.9
# and the old spelling is no longer accepted by recent releases.
In[104]: g = sns.PairGrid(diamonds, height=5,
x_vars=["color", "cut", "clarity"],
y_vars=["price"])
g.map(sns.barplot)
g.fig.suptitle('Replication of Step 3 with PairGrid', y=1.02)
Out[104]: Text(0.5,1.02,'Replication of Step 3 with PairGrid')