3. 索引爆炸

  1. # 读取employee数据集,设定行索引是RACE
  2. In[22]: employee = pd.read_csv('data/employee.csv', index_col='RACE')
  3. employee.head()
  4. Out[22]:

3. 索引爆炸 - 图1

  1. # 选取BASE_SALARY做成两个Series,用is判断二者是否为同一个对象
  2. In[23]: salary1 = employee['BASE_SALARY']
  3. salary2 = employee['BASE_SALARY']
  4. salary1 is salary2
  5. Out[23]: True
  1. # 结果是True,表明二者指向的是同一个对象。这意味着,如果修改其中一个,另一个也会随之改变。为了得到一份全新的数据副本,使用copy方法:
  2. In[24]: salary1 = employee['BASE_SALARY'].copy()
  3. salary2 = employee['BASE_SALARY'].copy()
  4. salary1 is salary2
  5. Out[24]: False
  1. # 对其中一个做索引排序,比较二者是否不同
  2. In[25]: salary1 = salary1.sort_index()
  3. salary1.head()
  4. Out[25]: RACE
  5. American Indian or Alaskan Native 78355.0
  6. American Indian or Alaskan Native 26125.0
  7. American Indian or Alaskan Native 98536.0
  8. American Indian or Alaskan Native NaN
  9. American Indian or Alaskan Native 55461.0
  10. Name: BASE_SALARY, dtype: float64
  11. In[26]: salary2.head()
  12. Out[26]: RACE
  13. Hispanic/Latino 121862.0
  14. Hispanic/Latino 26125.0
  15. White 45279.0
  16. White 63166.0
  17. White 56347.0
  18. Name: BASE_SALARY, dtype: float64
  1. # 将两个Series相加
  2. In[27]: salary_add = salary1 + salary2
  3. In[28]: salary_add.head()
  4. Out[28]: RACE
  5. American Indian or Alaskan Native 138702.0
  6. American Indian or Alaskan Native 156710.0
  7. American Indian or Alaskan Native 176891.0
  8. American Indian or Alaskan Native 159594.0
  9. American Indian or Alaskan Native 127734.0
  10. Name: BASE_SALARY, dtype: float64
  1. # 再将salary1与其自身相加;查看几个所得结果的长度,可以看到salary_add的长度从2000暴增到了约117万,而salary_add1的长度仍是2000(两个索引完全相同时不会产生笛卡尔积)
  2. In[29]: salary_add1 = salary1 + salary1
  3. len(salary1), len(salary2), len(salary_add), len(salary_add1)
  4. Out[29]: (2000, 2000, 1175424, 2000)

更多

  1. # 验证salary_add值的个数。因为笛卡尔积只发生在相同的索引标签之间,所以可以对每个索引标签出现次数的平方求和
  2. In[30]: index_vc = salary1.index.value_counts(dropna=False)
  3. index_vc
  4. Out[30]: Black or African American 700
  5. White 665
  6. Hispanic/Latino 480
  7. Asian/Pacific Islander 107
  8. NaN 35
  9. American Indian or Alaskan Native 11
  10. Others 2
  11. Name: RACE, dtype: int64
  12. In[31]: index_vc.pow(2).sum()
  13. Out[31]: 1175424