4. 反转stacked数据
# 读取college数据集,学校名(INSTNM)作为行索引,只选取本科生种族比例的列(列名包含UGDS_)
In[25]: usecol_func = lambda x: 'UGDS_' in x or x == 'INSTNM'
college = pd.read_csv('data/college.csv',
index_col='INSTNM',
usecols=usecol_func)
college.head()
out[25]:
# 用stack方法,将所有水平列名,转化为垂直的行索引
In[26]: college_stacked = college.stack()
college_stacked.head(18)
out[26]: INSTNM
Alabama A & M University UGDS_WHITE 0.0333
UGDS_BLACK 0.9353
UGDS_HISP 0.0055
UGDS_ASIAN 0.0019
UGDS_AIAN 0.0024
UGDS_NHPI 0.0019
UGDS_2MOR 0.0000
UGDS_NRA 0.0059
UGDS_UNKN 0.0138
University of Alabama at Birmingham UGDS_WHITE 0.5922
UGDS_BLACK 0.2600
UGDS_HISP 0.0283
UGDS_ASIAN 0.0518
UGDS_AIAN 0.0022
UGDS_NHPI 0.0007
UGDS_2MOR 0.0368
UGDS_NRA 0.0179
UGDS_UNKN 0.0100
dtype: float64
# unstack方法可以将其还原
In[27]: college_stacked.unstack().head()
out[27]:
# 另一种方式是先用melt,再用pivot。先加载数据,这次不指定行索引,INSTNM保留为普通列
In[28]: college2 = pd.read_csv('data/college.csv',
usecols=usecol_func)
college2.head()
out[28]:
# 使用melt,将所有race列变为一列
In[29]: college_melted = college2.melt(id_vars='INSTNM',
var_name='Race',
value_name='Percentage')
college_melted.head()
out[29]:
# 用pivot还原
In[30]: melted_inv = college_melted.pivot(index='INSTNM',
columns='Race',
values='Percentage')
melted_inv.head()
out[30]:
# pivot之后行和列的顺序与原数据不同;用loc按原先的行、列顺序同时选取,然后重置索引,可以获得和原先college2完全一样的DataFrame
In[31]: college2_replication = melted_inv.loc[college2['INSTNM'],
college2.columns[1:]]\
.reset_index()
college2.equals(college2_replication)
out[31]: True
更多
# 使用最外层的行索引(学校名)做unstack,即unstack(0),结果相当于把原DataFrame转置
In[32]: college.stack().unstack(0)
out[32]:
# 转置DataFrame更简单的方法是transpose()或T
In[33]: college.T
out[33]: