3. 同时stack多组变量
# 读取movie数据集,选取所有演员名和其Facebook likes
# In[18]:
# Load the movie table, then keep the title together with the three actor
# name columns and their matching Facebook-likes columns.
movie = pd.read_csv('data/movie.csv')
actor = movie.loc[:, [
    'movie_title',
    'actor_1_name',
    'actor_2_name',
    'actor_3_name',
    'actor_1_facebook_likes',
    'actor_2_facebook_likes',
    'actor_3_facebook_likes',
]]
actor.head()
out[18]:
# 创建一个自定义函数,用来改变列名。wide_to_long要求分组的变量要有相同的数字结尾:
# In[19]:
def change_col_name(col_name):
    """Rewrite a column label so its group number becomes the trailing suffix.

    ``pd.wide_to_long`` expects every column of a group to share a stub name
    and to end with the group identifier, so labels are reshaped as follows:

    * ``actor_1_name``            -> ``actor_1`` (``_name`` is dropped)
    * ``actor_1_facebook_likes``  -> ``actor_facebook_likes_1``

    Labels that do not have the ``<stub>_<digits>_facebook...`` layout are
    returned with only the ``_name`` removal applied.

    Args:
        col_name: The original column label.

    Returns:
        The reshaped column label.
    """
    col_name = col_name.replace('_name', '')

    # Split on the real structure of the label instead of assuming a
    # five-character "actor" prefix, so labels such as
    # 'director_facebook_likes' are left intact rather than scrambled.
    head, marker, tail = col_name.partition('_facebook')
    stub, _, number = head.rpartition('_')
    if marker and stub and number.isdigit():
        # Move the group number from the middle of the label to the end.
        col_name = stub + marker + tail + '_' + number
    return col_name
# In[20]:
# Run every column label through change_col_name; rename returns a new
# DataFrame and leaves `actor` untouched.
actor2 = actor.rename(change_col_name, axis='columns')
actor2.head()
out[20]:
# 使用wide_to_long函数,同时stack actor和actor_facebook_likes这两组列
# In[21]:
# Each stub names one group of columns; the text after the '_' separator
# is moved into the new 'actor_num' index level.
stubs = ['actor', 'actor_facebook_likes']
actor2_tidy = pd.wide_to_long(
    actor2,
    stubs,
    i=['movie_title'],
    j='actor_num',
    sep='_',
)
# Turn the (movie_title, actor_num) index back into ordinary columns.
actor2_tidy = actor2_tidy.reset_index()
actor2_tidy.head()
out[21]:
更多
# 加载数据
# In[22]:
# Read the sample wide-format table used for the rest of this section.
df = pd.read_csv(filepath_or_buffer='data/stackme.csv')
df
out[22]:
# 对列重命名
# In[23]:
# Give the four value columns a shared '<stub>_<suffix>' shape so that
# wide_to_long can split them into a stub and a suffix.
df2 = df.rename(
    {
        'a1': 'group1_a1',
        'b2': 'group1_b2',
        'd': 'group2_a1',
        'e': 'group2_b2',
    },
    axis='columns',
)
df2
out[23]:
# 设定stubnames=['group1', 'group2'],并用suffix='.+'匹配任意后缀(不仅限于数字)
# In[24]:
# The suffix pattern '.+' accepts any non-empty text after the separator
# (here 'a1' and 'b2'), rather than only the digit suffixes that
# wide_to_long matches by default.
pd.wide_to_long(
    df2,
    ['group1', 'group2'],
    i=['State', 'Country', 'Test'],
    j='Label',
    sep='_',
    suffix='.+',
)
out[24]: