第08章数据清理 - 9. 当多个变量被存储为列的值时进行清理 - 《Pandas Cookbook 带注释源码》

9. 当多个变量被存储为列的值时进行清理

#  读取restaurant_inspections数据集，将Date列的数据类型变为datetime64
 In[67]: inspections = pd.read_csv('data/restaurant_inspections.csv', parse_dates=['Date'])
         inspections.head(10)
out[67]:

#  用info列的所有值造一个新列。但是，Pandas不支持这种功能
 In[68]: inspections.pivot(index=['Name', 'Date'], columns='Info', values='Value')
---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
/Users/Ted/anaconda/lib/python3.6/site-packages/pandas/core/categorical.py in __init__(self, values, categories, ordered, fastpath)
    297             try:
--> 298                 codes, categories = factorize(values, sort=True)
    299             except TypeError:
/Users/Ted/anaconda/lib/python3.6/site-packages/pandas/core/algorithms.py in factorize(values, sort, order, na_sentinel, size_hint)
    559     check_nulls = not is_integer_dtype(original)
--> 560     labels = table.get_labels(values, uniques, 0, na_sentinel, check_nulls)
    561 
pandas/_libs/hashtable_class_helper.pxi in pandas._libs.hashtable.PyObjectHashTable.get_labels (pandas/_libs/hashtable.c:21922)()
ValueError: Buffer has wrong number of dimensions (expected 1, got 2)
During handling of the above exception, another exception occurred:
NotImplementedError                       Traceback (most recent call last)
<ipython-input-68-754f69d68d6c> in <module>()
----> 1 inspections.pivot(index=['Name', 'Date'], columns='Info', values='Value')
/Users/Ted/anaconda/lib/python3.6/site-packages/pandas/core/frame.py in pivot(self, index, columns, values)
   3851         """
   3852         from pandas.core.reshape.reshape import pivot
-> 3853         return pivot(self, index=index, columns=columns, values=values)
   3854 
   3855     def stack(self, level=-1, dropna=True):
/Users/Ted/anaconda/lib/python3.6/site-packages/pandas/core/reshape/reshape.py in pivot(self, index, columns, values)
    375             index = self[index]
    376         indexed = Series(self[values].values,
--> 377                          index=MultiIndex.from_arrays([index, self[columns]]))
    378         return indexed.unstack(columns)
    379 
/Users/Ted/anaconda/lib/python3.6/site-packages/pandas/core/indexes/multi.py in from_arrays(cls, arrays, sortorder, names)
   1098         from pandas.core.categorical import _factorize_from_iterables
   1099 
-> 1100         labels, levels = _factorize_from_iterables(arrays)
   1101         if names is None:
   1102             names = [getattr(arr, "name", None) for arr in arrays]
/Users/Ted/anaconda/lib/python3.6/site-packages/pandas/core/categorical.py in _factorize_from_iterables(iterables)
   2191         #  For consistency, it should return a list of 2 lists.
   2192         return [[], []]
-> 2193     return map(list, lzip(*[_factorize_from_iterable(it) for it in iterables]))
/Users/Ted/anaconda/lib/python3.6/site-packages/pandas/core/categorical.py in <listcomp>(.0)
   2191         #  For consistency, it should return a list of 2 lists.
   2192         return [[], []]
-> 2193     return map(list, lzip(*[_factorize_from_iterable(it) for it in iterables]))
/Users/Ted/anaconda/lib/python3.6/site-packages/pandas/core/categorical.py in _factorize_from_iterable(values)
   2163         codes = values.codes
   2164     else:
-> 2165         cat = Categorical(values, ordered=True)
   2166         categories = cat.categories
   2167         codes = cat.codes
/Users/Ted/anaconda/lib/python3.6/site-packages/pandas/core/categorical.py in __init__(self, values, categories, ordered, fastpath)
    308 
    309                 #  FIXME
--> 310                 raise NotImplementedError("> 1 ndim Categorical are not "
    311                                           "supported at this time")
    312 
NotImplementedError: > 1 ndim Categorical are not supported at this time

#  将'Name','Date', 'Info'作为所索引
 In[69]: inspections.set_index(['Name','Date', 'Info']).head(10)
out[69]:

#  用pivot，将info列中的值变为新的列
 In[70]: inspections.set_index(['Name','Date', 'Info']).unstack('Info').head()
out[70]:

#  用reset_index方法，使行索引层级与列索引相同
 In[71]: insp_tidy = inspections.set_index(['Name','Date', 'Info']) \
                                        .unstack('Info') \
                                        .reset_index(col_level=-1)
         insp_tidy.head()
out[71]:

#  除掉列索引的最外层，重命名行索引的层为None
 In[72]: insp_tidy.columns = insp_tidy.columns.droplevel(0).rename(None)
         insp_tidy.head()
out[72]:

#  使用squeeze方法，可以避免前面的多级索引
 In[73]: inspections.set_index(['Name','Date', 'Info']) \
                    .squeeze() \
                    .unstack('Info') \
                    .reset_index() \
                    .rename_axis(None, axis='columns')
out[73]:

#  pivot_table需要传入聚合函数，才能产生一个单一值
 In[74]: inspections.pivot_table(index=['Name', 'Date'], 
                                 columns='Info', 
                                 values='Value', 
                                 aggfunc='first') \
                    .reset_index()\
                    .rename_axis(None, axis='columns')
out[74]:

9. 当多个变量被存储为列的值时进行清理

9. 当多个变量被存储为列的值时进行清理

更多