9. 当多个变量被存储为列的值时进行清理
# 读取restaurant_inspections数据集,将Date列的数据类型变为datetime64
In[67]: inspections = pd.read_csv('data/restaurant_inspections.csv', parse_dates=['Date'])
inspections.head(10)
out[67]:
# 用info列的所有值造一个新列。但是,Pandas不支持这种功能
In[68]: inspections.pivot(index=['Name', 'Date'], columns='Info', values='Value')
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
/Users/Ted/anaconda/lib/python3.6/site-packages/pandas/core/categorical.py in __init__(self, values, categories, ordered, fastpath)
297 try:
--> 298 codes, categories = factorize(values, sort=True)
299 except TypeError:
/Users/Ted/anaconda/lib/python3.6/site-packages/pandas/core/algorithms.py in factorize(values, sort, order, na_sentinel, size_hint)
559 check_nulls = not is_integer_dtype(original)
--> 560 labels = table.get_labels(values, uniques, 0, na_sentinel, check_nulls)
561
pandas/_libs/hashtable_class_helper.pxi in pandas._libs.hashtable.PyObjectHashTable.get_labels (pandas/_libs/hashtable.c:21922)()
ValueError: Buffer has wrong number of dimensions (expected 1, got 2)
During handling of the above exception, another exception occurred:
NotImplementedError Traceback (most recent call last)
<ipython-input-68-754f69d68d6c> in <module>()
----> 1 inspections.pivot(index=['Name', 'Date'], columns='Info', values='Value')
/Users/Ted/anaconda/lib/python3.6/site-packages/pandas/core/frame.py in pivot(self, index, columns, values)
3851 """
3852 from pandas.core.reshape.reshape import pivot
-> 3853 return pivot(self, index=index, columns=columns, values=values)
3854
3855 def stack(self, level=-1, dropna=True):
/Users/Ted/anaconda/lib/python3.6/site-packages/pandas/core/reshape/reshape.py in pivot(self, index, columns, values)
375 index = self[index]
376 indexed = Series(self[values].values,
--> 377 index=MultiIndex.from_arrays([index, self[columns]]))
378 return indexed.unstack(columns)
379
/Users/Ted/anaconda/lib/python3.6/site-packages/pandas/core/indexes/multi.py in from_arrays(cls, arrays, sortorder, names)
1098 from pandas.core.categorical import _factorize_from_iterables
1099
-> 1100 labels, levels = _factorize_from_iterables(arrays)
1101 if names is None:
1102 names = [getattr(arr, "name", None) for arr in arrays]
/Users/Ted/anaconda/lib/python3.6/site-packages/pandas/core/categorical.py in _factorize_from_iterables(iterables)
2191 # For consistency, it should return a list of 2 lists.
2192 return [[], []]
-> 2193 return map(list, lzip(*[_factorize_from_iterable(it) for it in iterables]))
/Users/Ted/anaconda/lib/python3.6/site-packages/pandas/core/categorical.py in <listcomp>(.0)
2191 # For consistency, it should return a list of 2 lists.
2192 return [[], []]
-> 2193 return map(list, lzip(*[_factorize_from_iterable(it) for it in iterables]))
/Users/Ted/anaconda/lib/python3.6/site-packages/pandas/core/categorical.py in _factorize_from_iterable(values)
2163 codes = values.codes
2164 else:
-> 2165 cat = Categorical(values, ordered=True)
2166 categories = cat.categories
2167 codes = cat.codes
/Users/Ted/anaconda/lib/python3.6/site-packages/pandas/core/categorical.py in __init__(self, values, categories, ordered, fastpath)
308
309 # FIXME
--> 310 raise NotImplementedError("> 1 ndim Categorical are not "
311 "supported at this time")
312
NotImplementedError: > 1 ndim Categorical are not supported at this time
# 将'Name','Date', 'Info'作为所索引
In[69]: inspections.set_index(['Name','Date', 'Info']).head(10)
out[69]:
# 用pivot,将info列中的值变为新的列
In[70]: inspections.set_index(['Name','Date', 'Info']).unstack('Info').head()
out[70]:
# 用reset_index方法,使行索引层级与列索引相同
In[71]: insp_tidy = inspections.set_index(['Name','Date', 'Info']) \
.unstack('Info') \
.reset_index(col_level=-1)
insp_tidy.head()
out[71]:
# 除掉列索引的最外层,重命名行索引的层为None
In[72]: insp_tidy.columns = insp_tidy.columns.droplevel(0).rename(None)
insp_tidy.head()
out[72]:
# 使用squeeze方法,可以避免前面的多级索引
In[73]: inspections.set_index(['Name','Date', 'Info']) \
.squeeze() \
.unstack('Info') \
.reset_index() \
.rename_axis(None, axis='columns')
out[73]:
更多
# pivot_table需要传入聚合函数,才能产生一个单一值
In[74]: inspections.pivot_table(index=['Name', 'Date'],
columns='Info',
values='Value',
aggfunc='first') \
.reset_index()\
.rename_axis(None, axis='columns')
out[74]: