7. 使用Seaborn网格做多变量分析
# 读取employee数据集,创建工龄的列
In[85]: employee = pd.read_csv('data/employee.csv',
                               parse_dates=['HIRE_DATE', 'JOB_DATE'])
# Elapsed time between each hire date and the fixed reference date
# 1 December 2016 (ISO spelling avoids any month/day ambiguity).
days_hired = pd.Timestamp('2016-12-01') - employee['HIRE_DATE']
# pd.Timedelta(1, unit='Y') is rejected by current pandas releases; spell out
# the same average Gregorian year (365.2425 days) that the unit stood for.
one_year = pd.Timedelta(days=365.2425)
# Timedelta / Timedelta gives a plain float: tenure expressed in years.
employee['YEARS_EXPERIENCE'] = days_hired / one_year
employee[['HIRE_DATE', 'YEARS_EXPERIENCE']].head()
Out[85]:
# 画一个基本的带有回归线的散点图
In[86]: import seaborn as sns
In[87]: ax = sns.regplot(data=employee,
                         x='YEARS_EXPERIENCE',
                         y='BASE_SALARY')
# regplot draws on a single Axes; widen the figure that owns it to 14 x 4 in.
ax.figure.set_size_inches((14, 4))
Out[87]:
# 用regplot的上层函数lmplot,画出不同性别的回归线
In[88]: grid = sns.lmplot(data=employee,
                          x='YEARS_EXPERIENCE', y='BASE_SALARY',
                          hue='GENDER', palette='Greys',
                          scatter_kws=dict(s=10))
# The returned grid owns the whole figure, so resize through the grid.
grid.fig.set_size_inches((14, 4))
# Show the type of object lmplot hands back.
type(grid)
Out[88]: seaborn.axisgrid.FacetGrid
# 为每个种族创建子图,同时保留回归线
In[89]: grid = sns.lmplot(x='YEARS_EXPERIENCE', y='BASE_SALARY',
                          hue='GENDER', col='RACE', col_wrap=3,
                          palette='Greys',
                          # Facet options are passed through facet_kws; giving
                          # sharex directly to lmplot is deprecated.
                          facet_kws={'sharex': False},
                          line_kws={'linewidth': 5},
                          data=employee)
# Apply one common salary range to every panel so they can be compared.
grid.set(ylim=(20000, 120000))
Out[89]: <seaborn.axisgrid.FacetGrid at 0x11e7ce470>
# 将部门的层级减小到二,将种族的层级减小到三
In[90]: deps = employee['DEPARTMENT'].value_counts().head(2).index
# deps / races hold the two most frequent departments and the three most
# frequent races (value_counts sorts by descending frequency).
races = employee['RACE'].value_counts().head(3).index
is_dep = employee['DEPARTMENT'].isin(deps)
is_race = employee['RACE'].isin(races)
# Keep rows that satisfy both filters; copy so the edit below does not
# touch the original employee frame.
emp2 = employee.loc[is_dep & is_race].copy()
# Replace each department name with the HPD/HFD code found inside it.
emp2['DEPARTMENT'] = emp2['DEPARTMENT'].str.extract('(HPD|HFD)', expand=True)
emp2.shape
Out[90]: (968, 11)
In[91]: emp2.DEPARTMENT.value_counts()  # row count per retained department
Out[91]: HPD 591
HFD 377
Name: DEPARTMENT, dtype: int64
In[92]: emp2.RACE.value_counts()  # row count per retained race
Out[92]: White 478
Hispanic/Latino 250
Black or African American 240
Name: RACE, dtype: int64
# 用Axes层函数,比如violinplot来画出工龄和性别的分布
In[93]: ax = sns.violinplot(data=emp2, x='YEARS_EXPERIENCE', y='GENDER')
# Resize the figure that owns the returned Axes to 10 x 4 inches.
ax.figure.set_size_inches((10, 4))
Out[93]:
# Use the figure-level catplot function to draw one violin panel for every
# department (rows) x race (columns) combination
In[94]: sns.catplot(x='YEARS_EXPERIENCE', y='GENDER',
                    col='RACE', row='DEPARTMENT',
                    # catplot is the current name of factorplot, and the
                    # per-facet size argument is now called height.
                    height=3, aspect=2,
                    data=emp2, kind='violin')
Out[94]: <seaborn.axisgrid.FacetGrid at 0x11e40ec50>