5. 从不同的DataFrame追加列
# 读取employee数据,选取'DEPARTMENT', 'BASE_SALARY'这两列
In[48]: employee = pd.read_csv('data/employee.csv')
dept_sal = employee[['DEPARTMENT', 'BASE_SALARY']]
# 在每个部门内,对BASE_SALARY进行排序
In[49]: dept_sal = dept_sal.sort_values(['DEPARTMENT', 'BASE_SALARY'],
ascending=[True, False])
# 用drop_duplicates方法保留每个部门的第一行
In[50]: max_dept_sal = dept_sal.drop_duplicates(subset='DEPARTMENT')
max_dept_sal.head()
Out[50]:
# 使用DEPARTMENT作为行索引
In[51]: max_dept_sal = max_dept_sal.set_index('DEPARTMENT')
employee = employee.set_index('DEPARTMENT')
# 现在行索引包含匹配值了,可以向employee的DataFrame新增一列
In[52]: employee['MAX_DEPT_SALARY'] = max_dept_sal['BASE_SALARY']
In[53]: pd.options.display.max_columns = 6
Out[54]:
# 现在可以用query查看是否有BASE_SALARY大于MAX_DEPT_SALARY的
In[55]: employee.query('BASE_SALARY > MAX_DEPT_SALARY')
Out[55]:
原理
# 用random从dept_sal随机取10行,不做替换
In[56]: np.random.seed(1234)
random_salary = dept_sal.sample(n=10).set_index('DEPARTMENT')
random_salary
Out[56]:
# random_salary中是有重复索引的,employee DataFrame的标签要对应random_salary中的多个标签
In[57]: employee['RANDOM_SALARY'] = random_salary['BASE_SALARY']
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
<ipython-input-57-1cbebe15fa39> in <module>()
----> 1 employee['RANDOM_SALARY'] = random_salary['BASE_SALARY']
/Users/Ted/anaconda/lib/python3.6/site-packages/pandas/core/frame.py in __setitem__(self, key, value)
2329 else:
2330 # set column
-> 2331 self._set_item(key, value)
2332
2333 def _setitem_slice(self, key, value):
/Users/Ted/anaconda/lib/python3.6/site-packages/pandas/core/frame.py in _set_item(self, key, value)
2395
2396 self._ensure_valid_index(value)
-> 2397 value = self._sanitize_column(key, value)
2398 NDFrame._set_item(self, key, value)
2399
/Users/Ted/anaconda/lib/python3.6/site-packages/pandas/core/frame.py in _sanitize_column(self, key, value, broadcast)
2545
2546 if isinstance(value, Series):
-> 2547 value = reindexer(value)
2548
2549 elif isinstance(value, DataFrame):
/Users/Ted/anaconda/lib/python3.6/site-packages/pandas/core/frame.py in reindexer(value)
2537 # duplicate axis
2538 if not value.index.is_unique:
-> 2539 raise e
2540
2541 # other
/Users/Ted/anaconda/lib/python3.6/site-packages/pandas/core/frame.py in reindexer(value)
2532 # GH 4107
2533 try:
-> 2534 value = value.reindex(self.index)._values
2535 except Exception as e:
2536
/Users/Ted/anaconda/lib/python3.6/site-packages/pandas/core/series.py in reindex(self, index, **kwargs)
2424 @Appender(generic._shared_docs['reindex'] % _shared_doc_kwargs)
2425 def reindex(self, index=None, **kwargs):
-> 2426 return super(Series, self).reindex(index=index, **kwargs)
2427
2428 @Appender(generic._shared_docs['fillna'] % _shared_doc_kwargs)
/Users/Ted/anaconda/lib/python3.6/site-packages/pandas/core/generic.py in reindex(self, *args, **kwargs)
2513 # perform the reindex on the axes
2514 return self._reindex_axes(axes, level, limit, tolerance, method,
-> 2515 fill_value, copy).__finalize__(self)
2516
2517 def _reindex_axes(self, axes, level, limit, tolerance, method, fill_value,
/Users/Ted/anaconda/lib/python3.6/site-packages/pandas/core/generic.py in _reindex_axes(self, axes, level, limit, tolerance, method, fill_value, copy)
2531 obj = obj._reindex_with_indexers({axis: [new_index, indexer]},
2532 fill_value=fill_value,
-> 2533 copy=copy, allow_dups=False)
2534
2535 return obj
/Users/Ted/anaconda/lib/python3.6/site-packages/pandas/core/generic.py in _reindex_with_indexers(self, reindexers, fill_value, copy, allow_dups)
2625 fill_value=fill_value,
2626 allow_dups=allow_dups,
-> 2627 copy=copy)
2628
2629 if copy and new_data is self._data:
/Users/Ted/anaconda/lib/python3.6/site-packages/pandas/core/internals.py in reindex_indexer(self, new_axis, indexer, axis, fill_value, allow_dups, copy)
3884 # some axes don't allow reindexing with dups
3885 if not allow_dups:
-> 3886 self.axes[axis]._can_reindex(indexer)
3887
3888 if axis >= self.ndim:
/Users/Ted/anaconda/lib/python3.6/site-packages/pandas/core/indexes/base.py in _can_reindex(self, indexer)
2834 # trying to reindex on an axis with duplicates
2835 if not self.is_unique and len(indexer):
-> 2836 raise ValueError("cannot reindex from a duplicate axis")
2837
2838 def reindex(self, target, method=None, level=None, limit=None,
ValueError: cannot reindex from a duplicate axis
更多
# 选取max_dept_sal['BASE_SALARY']的前三行,赋值给employee['MAX_SALARY2']
In[58]: employee['MAX_SALARY2'] = max_dept_sal['BASE_SALARY'].head(3)
# 对MAX_SALARY2统计
In[59]: employee.MAX_SALARY2.value_counts()
Out[59]: 140416.0 29
100000.0 11
64251.0 5
Name: MAX_SALARY2, dtype: int64
# 因为只填充了三个部门的值,所有其它部门在结果中都是缺失值
In[60]: employee.MAX_SALARY2.isnull().mean()
Out[60]: 0.97750000000000004