12. 当多个观察单位被存储于同一张表时进行清理
# 读取movie_altered数据集
In[88]: movie = pd.read_csv('data/movie_altered.csv')
movie.head()
out[88]:
# 插入新的列,用来标识每一部电影
In[89]: movie.insert(0, 'id', np.arange(len(movie)))
movie.head()
out[89]:
# 用wide_to_long,将所有演员放到一列,将所有Facebook likes放到一列
In[90]: stubnames = ['director', 'director_fb_likes', 'actor', 'actor_fb_likes']
movie_long = pd.wide_to_long(movie,
stubnames=stubnames,
i='id',
j='num',
sep='_').reset_index()
movie_long['num'] = movie_long['num'].astype(int)
movie_long.head(9)
out[90]:
# 将这个数据分解成多个小表
In[91]: movie_table = movie_long[['id','title', 'year', 'duration', 'rating']]
director_table = movie_long[['id', 'director', 'num', 'director_fb_likes']]
actor_table = movie_long[['id', 'actor', 'num', 'actor_fb_likes']]
In[92]: movie_table.head(9)
out[90]:
In[93]: director_table.head(9)
out[93]:
In[94]: actor_table.head(9)
out[94]:
# 做一些去重和去除缺失值的工作
In[95]: movie_table = movie_table.drop_duplicates().reset_index(drop=True)
director_table = director_table.dropna().reset_index(drop=True)
actor_table = actor_table.dropna().reset_index(drop=True)
In[96]: movie_table.head()
out[96]:
In[97]: director_table.head()
out[97]:
# 比较内存的使用量
In[98]: movie.memory_usage(deep=True).sum()
out[98]: 2318234
In[99]: movie_table.memory_usage(deep=True).sum() + \
director_table.memory_usage(deep=True).sum() + \
actor_table.memory_usage(deep=True).sum()
out[99]: 2624898
# 创建演员和导演的id列
In[100]: director_cat = pd.Categorical(director_table['director'])
director_table.insert(1, 'director_id', director_cat.codes)
actor_cat = pd.Categorical(actor_table['actor'])
actor_table.insert(1, 'actor_id', actor_cat.codes)
director_table.head()
out[100]:
In[101]: actor_table.head()
out[101]:
# 可以用这两张表生成要用的中间表。先来做director表
In[102]: director_associative = director_table[['id', 'director_id', 'num']]
dcols = ['director_id', 'director', 'director_fb_likes']
director_unique = director_table[dcols].drop_duplicates().reset_index(drop=True)
director_associative.head()
out[102]:
In[103]: director_unique.head()
out[103]:
# 再来做actor表
In[104]: actor_associative = actor_table[['id', 'actor_id', 'num']]
acols = ['actor_id', 'actor', 'actor_fb_likes']
actor_unique = actor_table[acols].drop_duplicates().reset_index(drop=True)
actor_associative.head()
out[104]:
In[105]: actor_unique.head()
out[105]:
# 查看新的表所使用的内存量
In[106]: movie_table.memory_usage(deep=True).sum() + \
director_associative.memory_usage(deep=True).sum() + \
director_unique.memory_usage(deep=True).sum() + \
actor_associative.memory_usage(deep=True).sum() + \
actor_unique.memory_usage(deep=True).sum()
out[106]: 1833402
In[107]: movie_table.head()
out[107]:
# 可以通过将左右表组合起来形成movie表。首先将附表与actor/director表结合,然后将num列pivot,再加上列的前缀
In[108]: actors = actor_associative.merge(actor_unique, on='actor_id') \
.drop('actor_id', 1) \
.pivot_table(index='id', columns='num', aggfunc='first')
actors.columns = actors.columns.get_level_values(0) + '_' + \
actors.columns.get_level_values(1).astype(str)
directors = director_associative.merge(director_unique, on='director_id') \
.drop('director_id', 1) \
.pivot_table(index='id', columns='num', aggfunc='first')
directors.columns = directors.columns.get_level_values(0) + '_' + \
directors.columns.get_level_values(1).astype(str)
In[109]: actors.head()
out[109]:
In[110]: directors.head()
out[110]:
In[111]: movie2 = movie_table.merge(directors.reset_index(), on='id', how='left') \
.merge(actors.reset_index(), on='id', how='left')
In[112]: movie2.head()
out[112]:
In[113]: movie.equals(movie2[movie.columns])
out[113]: True