3. 同时stack多组变量

  1. # 读取movie数据集,选取所有演员名和其Facebook likes
  2. In[18]: movie = pd.read_csv('data/movie.csv')
  3. actor = movie[['movie_title', 'actor_1_name', 'actor_2_name', 'actor_3_name',
  4. 'actor_1_facebook_likes', 'actor_2_facebook_likes', 'actor_3_facebook_likes']]
  5. actor.head()
  6. out[18]:

3. 同时stack多组变量 - 图1

  1. # 创建一个自定义函数,用来改变列名。wide_to_long要求同一组的列以相同的前缀(stub)开头,并以数字后缀结尾:
  def change_col_name(col_name):
      """Rename a column so that its numeric index becomes a trailing suffix.

      ``pd.wide_to_long`` expects every column of a group to consist of a
      shared stub followed by a separator and a suffix, so the index has to
      sit at the very end of the name:

      * ``'actor_1_name'``           -> ``'actor_1'``
      * ``'actor_1_facebook_likes'`` -> ``'actor_facebook_likes_1'``

      Names that carry no numeric index directly before ``'_facebook'``
      (for example ``'movie_title'`` or ``'director_facebook_likes'``) are
      returned without any reordering.

      Parameters
      ----------
      col_name : str
          Original column label.

      Returns
      -------
      str
          The column label with ``'_name'`` removed and, where present, the
          numeric index moved to the end.
      """
      col_name = col_name.replace('_name', '')

      # Split around the first '_facebook' instead of assuming a fixed
      # five-character stub, so stubs other than 'actor' are handled too.
      head, marker, tail = col_name.partition('_facebook')
      if marker:
          stub, _, index = head.rpartition('_')
          # Only reorder when there really is a numeric index to move.
          if stub and index.isdigit():
              col_name = stub + marker + tail + '_' + index
      return col_name
  8. In[20]: actor2 = actor.rename(columns=change_col_name)
  9. actor2.head()
  10. out[20]:

3. 同时stack多组变量 - 图2

  1. # 使用wide_to_long函数,同时stack actor和actor_facebook_likes这两组列
  2. In[21]: stubs = ['actor', 'actor_facebook_likes']
  3. actor2_tidy = pd.wide_to_long(actor2,
  4. stubnames=stubs,
  5. i=['movie_title'],
  6. j='actor_num',
  7. sep='_').reset_index()
  8. actor2_tidy.head()
  9. out[21]:

3. 同时stack多组变量 - 图3

更多

  1. # 加载数据
  2. In[22]: df = pd.read_csv('data/stackme.csv')
  3. df
  4. out[22]:

3. 同时stack多组变量 - 图4

  1. # 对列重命名
  2. In[23]: df2 = df.rename(columns = {'a1':'group1_a1', 'b2':'group1_b2',
  3. 'd':'group2_a1', 'e':'group2_b2'})
  4. df2
  5. out[23]:

3. 同时stack多组变量 - 图5

  1. # 设定stubnames=['group1', 'group2'],并将suffix设为'.+',这样任意后缀(不只是数字)都能匹配;suffix默认为'\d+',只匹配数字
  2. In[24]: pd.wide_to_long(df2,
  3. stubnames=['group1', 'group2'],
  4. i=['State', 'Country', 'Test'],
  5. j='Label',
  6. suffix='.+',
  7. sep='_')
  8. out[24]:

3. 同时stack多组变量 - 图6