5.2 基本功能




  1. In [91]: obj = pd.Series([4.5, 7.2, -5.3, 3.6], index=['d', 'b', 'a', 'c'])
  2. In [92]: obj
  3. Out[92]:
  4. d 4.5
  5. b 7.2
  6. a -5.3
  7. c 3.6
  8. dtype: float64


  1. In [93]: obj2 = obj.reindex(['a', 'b', 'c', 'd', 'e'])
  2. In [94]: obj2
  3. Out[94]:
  4. a -5.3
  5. b 7.2
  6. c 3.6
  7. d 4.5
  8. e NaN
  9. dtype: float64


  1. In [95]: obj3 = pd.Series(['blue', 'purple', 'yellow'], index=[0, 2, 4])
  2. In [96]: obj3
  3. Out[96]:
  4. 0 blue
  5. 2 purple
  6. 4 yellow
  7. dtype: object
  8. In [97]: obj3.reindex(range(6), method='ffill')
  9. Out[97]:
  10. 0 blue
  11. 1 blue
  12. 2 purple
  13. 3 purple
  14. 4 yellow
  15. 5 yellow
  16. dtype: object


  1. In [98]: frame = pd.DataFrame(np.arange(9).reshape((3, 3)),
  2. ....: index=['a', 'c', 'd'],
  3. ....: columns=['Ohio', 'Texas', 'California'])
  4. In [99]: frame
  5. Out[99]:
  6. Ohio Texas California
  7. a 0 1 2
  8. c 3 4 5
  9. d 6 7 8
  10. In [100]: frame2 = frame.reindex(['a', 'b', 'c', 'd'])
  11. In [101]: frame2
  12. Out[101]:
  13. Ohio Texas California
  14. a 0.0 1.0 2.0
  15. b NaN NaN NaN
  16. c 3.0 4.0 5.0
  17. d 6.0 7.0 8.0


  1. In [102]: states = ['Texas', 'Utah', 'California']
  2. In [103]: frame.reindex(columns=states)
  3. Out[103]:
  4. Texas Utah California
  5. a 1 NaN 2
  6. c 4 NaN 5
  7. d 7 NaN 8


5.2 基本功能 - 图1



  1. In [105]: obj = pd.Series(np.arange(5.), index=['a', 'b', 'c', 'd', 'e'])
  2. In [106]: obj
  3. Out[106]:
  4. a 0.0
  5. b 1.0
  6. c 2.0
  7. d 3.0
  8. e 4.0
  9. dtype: float64
  10. In [107]: new_obj = obj.drop('c')
  11. In [108]: new_obj
  12. Out[108]:
  13. a 0.0
  14. b 1.0
  15. d 3.0
  16. e 4.0
  17. dtype: float64
  18. In [109]: obj.drop(['d', 'c'])
  19. Out[109]:
  20. a 0.0
  21. b 1.0
  22. e 4.0
  23. dtype: float64


  1. In [110]: data = pd.DataFrame(np.arange(16).reshape((4, 4)),
  2. .....: index=['Ohio', 'Colorado', 'Utah', 'New York'],
  3. .....: columns=['one', 'two', 'three', 'four'])
  4. In [111]: data
  5. Out[111]:
  6. one two three four
  7. Ohio 0 1 2 3
  8. Colorado 4 5 6 7
  9. Utah 8 9 10 11
  10. New York 12 13 14 15

用标签序列调用drop会从行标签(axis 0)删除值:

  1. In [112]: data.drop(['Colorado', 'Ohio'])
  2. Out[112]:
  3. one two three four
  4. Utah 8 9 10 11
  5. New York 12 13 14 15


  1. In [113]: data.drop('two', axis=1)
  2. Out[113]:
  3. one three four
  4. Ohio 0 2 3
  5. Colorado 4 6 7
  6. Utah 8 10 11
  7. New York 12 14 15
  8. In [114]: data.drop(['two', 'four'], axis='columns')
  9. Out[114]:
  10. one three
  11. Ohio 0 2
  12. Colorado 4 6
  13. Utah 8 10
  14. New York 12 14


  1. In [115]: obj.drop('c', inplace=True)
  2. In [116]: obj
  3. Out[116]:
  4. a 0.0
  5. b 1.0
  6. d 3.0
  7. e 4.0
  8. dtype: float64




  1. In [117]: obj = pd.Series(np.arange(4.), index=['a', 'b', 'c', 'd'])
  2. In [118]: obj
  3. Out[118]:
  4. a 0.0
  5. b 1.0
  6. c 2.0
  7. d 3.0
  8. dtype: float64
  9. In [119]: obj['b']
  10. Out[119]: 1.0
  11. In [120]: obj[1]
  12. Out[120]: 1.0
  13. In [121]: obj[2:4]
  14. Out[121]:
  15. c 2.0
  16. d 3.0
  17. dtype: float64
  18. In [122]: obj[['b', 'a', 'd']]
  19. Out[122]:
  20. b 1.0
  21. a 0.0
  22. d 3.0
  23. dtype: float64
  24. In [123]: obj[[1, 3]]
  25. Out[123]:
  26. b 1.0
  27. d 3.0
  28. dtype: float64
  29. In [124]: obj[obj < 2]
  30. Out[124]:
  31. a 0.0
  32. b 1.0
  33. dtype: float64


  1. In [125]: obj['b':'c']
  2. Out[125]:
  3. b 1.0
  4. c 2.0
  5. dtype: float64


  1. In [126]: obj['b':'c'] = 5
  2. In [127]: obj
  3. Out[127]:
  4. a 0.0
  5. b 5.0
  6. c 5.0
  7. d 3.0
  8. dtype: float64


  1. In [128]: data = pd.DataFrame(np.arange(16).reshape((4, 4)),
  2. .....: index=['Ohio', 'Colorado', 'Utah', 'New York'],
  3. .....: columns=['one', 'two', 'three', 'four'])
  4. In [129]: data
  5. Out[129]:
  6. one two three four
  7. Ohio 0 1 2 3
  8. Colorado 4 5 6 7
  9. Utah 8 9 10 11
  10. New York 12 13 14 15
  11. In [130]: data['two']
  12. Out[130]:
  13. Ohio 1
  14. Colorado 5
  15. Utah 9
  16. New York 13
  17. Name: two, dtype: int64
  18. In [131]: data[['three', 'one']]
  19. Out[131]:
  20. three one
  21. Ohio 2 0
  22. Colorado 6 4
  23. Utah 10 8
  24. New York 14 12


  1. In [132]: data[:2]
  2. Out[132]:
  3. one two three four
  4. Ohio 0 1 2 3
  5. Colorado 4 5 6 7
  6. In [133]: data[data['three'] > 5]
  7. Out[133]:
  8. one two three four
  9. Colorado 4 5 6 7
  10. Utah 8 9 10 11
  11. New York 12 13 14 15

选取行的语法data[:2]十分方便。向[ ]传递单一的元素或列表,就可选择列。


  1. In [134]: data < 5
  2. Out[134]:
  3. one two three four
  4. Ohio True True True True
  5. Colorado True False False False
  6. Utah False False False False
  7. New York False False False False
  8. In [135]: data[data < 5] = 0
  9. In [136]: data
  10. Out[136]:
  11. one two three four
  12. Ohio 0 0 0 0
  13. Colorado 0 5 6 7
  14. Utah 8 9 10 11
  15. New York 12 13 14 15





  1. In [137]: data.loc['Colorado', ['two', 'three']]
  2. Out[137]:
  3. two 5
  4. three 6
  5. Name: Colorado, dtype: int64


  1. In [138]: data.iloc[2, [3, 0, 1]]
  2. Out[138]:
  3. four 11
  4. one 8
  5. two 9
  6. Name: Utah, dtype: int64
  7. In [139]: data.iloc[2]
  8. Out[139]:
  9. one 8
  10. two 9
  11. three 10
  12. four 11
  13. Name: Utah, dtype: int64
  14. In [140]: data.iloc[[1, 2], [3, 0, 1]]
  15. Out[140]:
  16. four one two
  17. Colorado 7 0 5
  18. Utah 11 8 9


  1. In [141]: data.loc[:'Utah', 'two']
  2. Out[141]:
  3. Ohio 0
  4. Colorado 5
  5. Utah 9
  6. Name: two, dtype: int64
  7. In [142]: data.iloc[:, :3][data.three > 5]
  8. Out[142]:
  9. one two three
  10. Colorado 0 5 6
  11. Utah 8 9 10
  12. New York 12 13 14


笔记:在一开始设计pandas时,我觉得用frame[:, col]选取列过于繁琐(也容易出错),因为列的选择是非常常见的操作。我做了些取舍,将花式索引的功能(标签和整数)放到了ix运算符中。在实践中,当数据的轴标签是整数时,这会导致许多边缘情况,所以pandas团队决定创造loc和iloc运算符,分别处理严格基于标签的索引和严格基于整数的索引。

表5-4 DataFrame的索引选项



  1. ser = pd.Series(np.arange(3.))
  2. ser
  3. ser[-1]


  1. In [144]: ser
  2. Out[144]:
  3. 0 0.0
  4. 1 1.0
  5. 2 2.0
  6. dtype: float64


  1. In [145]: ser2 = pd.Series(np.arange(3.), index=['a', 'b', 'c'])
  2. In [146]: ser2[-1]
  3. Out[146]: 2.0


  1. In [147]: ser[:1]
  2. Out[147]:
  3. 0 0.0
  4. dtype: float64
  5. In [148]: ser.loc[:1]
  6. Out[148]:
  7. 0 0.0
  8. 1 1.0
  9. dtype: float64
  10. In [149]: ser.iloc[:1]
  11. Out[149]:
  12. 0 0.0
  13. dtype: float64



  1. In [150]: s1 = pd.Series([7.3, -2.5, 3.4, 1.5], index=['a', 'c', 'd', 'e'])
  2. In [151]: s2 = pd.Series([-2.1, 3.6, -1.5, 4, 3.1],
  3. .....: index=['a', 'c', 'e', 'f', 'g'])
  4. In [152]: s1
  5. Out[152]:
  6. a 7.3
  7. c -2.5
  8. d 3.4
  9. e 1.5
  10. dtype: float64
  11. In [153]: s2
  12. Out[153]:
  13. a -2.1
  14. c 3.6
  15. e -1.5
  16. f 4.0
  17. g 3.1
  18. dtype: float64


  1. In [154]: s1 + s2
  2. Out[154]:
  3. a 5.2
  4. c 1.1
  5. d NaN
  6. e 0.0
  7. f NaN
  8. g NaN
  9. dtype: float64



  1. In [155]: df1 = pd.DataFrame(np.arange(9.).reshape((3, 3)), columns=list('bcd'),
  2. .....: index=['Ohio', 'Texas', 'Colorado'])
  3. In [156]: df2 = pd.DataFrame(np.arange(12.).reshape((4, 3)), columns=list('bde'),
  4. .....: index=['Utah', 'Ohio', 'Texas', 'Oregon'])
  5. In [157]: df1
  6. Out[157]:
  7. b c d
  8. Ohio 0.0 1.0 2.0
  9. Texas 3.0 4.0 5.0
  10. Colorado 6.0 7.0 8.0
  11. In [158]: df2
  12. Out[158]:
  13. b d e
  14. Utah 0.0 1.0 2.0
  15. Ohio 3.0 4.0 5.0
  16. Texas 6.0 7.0 8.0
  17. Oregon 9.0 10.0 11.0


  1. In [159]: df1 + df2
  2. Out[159]:
  3. b c d e
  4. Colorado NaN NaN NaN NaN
  5. Ohio 3.0 NaN 6.0 NaN
  6. Oregon NaN NaN NaN NaN
  7. Texas 9.0 NaN 12.0 NaN
  8. Utah NaN NaN NaN NaN



  1. In [160]: df1 = pd.DataFrame({'A': [1, 2]})
  2. In [161]: df2 = pd.DataFrame({'B': [3, 4]})
  3. In [162]: df1
  4. Out[162]:
  5. A
  6. 0 1
  7. 1 2
  8. In [163]: df2
  9. Out[163]:
  10. B
  11. 0 3
  12. 1 4
  13. In [164]: df1 - df2
  14. Out[164]:
  15. A B
  16. 0 NaN NaN
  17. 1 NaN NaN



  1. In [165]: df1 = pd.DataFrame(np.arange(12.).reshape((3, 4)),
  2. .....: columns=list('abcd'))
  3. In [166]: df2 = pd.DataFrame(np.arange(20.).reshape((4, 5)),
  4. .....: columns=list('abcde'))
  5. In [167]: df2.loc[1, 'b'] = np.nan
  6. In [168]: df1
  7. Out[168]:
  8. a b c d
  9. 0 0.0 1.0 2.0 3.0
  10. 1 4.0 5.0 6.0 7.0
  11. 2 8.0 9.0 10.0 11.0
  12. In [169]: df2
  13. Out[169]:
  14. a b c d e
  15. 0 0.0 1.0 2.0 3.0 4.0
  16. 1 5.0 NaN 7.0 8.0 9.0
  17. 2 10.0 11.0 12.0 13.0 14.0
  18. 3 15.0 16.0 17.0 18.0 19.0


  1. In [170]: df1 + df2
  2. Out[170]:
  3. a b c d e
  4. 0 0.0 2.0 4.0 6.0 NaN
  5. 1 9.0 NaN 13.0 15.0 NaN
  6. 2 18.0 20.0 22.0 24.0 NaN
  7. 3 NaN NaN NaN NaN NaN


  1. In [171]: df1.add(df2, fill_value=0)
  2. Out[171]:
  3. a b c d e
  4. 0 0.0 2.0 4.0 6.0 4.0
  5. 1 9.0 5.0 13.0 15.0 9.0
  6. 2 18.0 20.0 22.0 24.0 14.0
  7. 3 15.0 16.0 17.0 18.0 19.0


  1. In [172]: 1 / df1
  2. Out[172]:
  3. a b c d
  4. 0 inf 1.000000 0.500000 0.333333
  5. 1 0.250000 0.200000 0.166667 0.142857
  6. 2 0.125000 0.111111 0.100000 0.090909
  7. In [173]: df1.rdiv(1)
  8. Out[173]:
  9. a b c d
  10. 0 inf 1.000000 0.500000 0.333333
  11. 1 0.250000 0.200000 0.166667 0.142857
  12. 2 0.125000 0.111111 0.100000 0.090909

表5-5 灵活的算术方法


  1. In [174]: df1.reindex(columns=df2.columns, fill_value=0)
  2. Out[174]:
  3. a b c d e
  4. 0 0.0 1.0 2.0 3.0 0
  5. 1 4.0 5.0 6.0 7.0 0
  6. 2 8.0 9.0 10.0 11.0 0



  1. In [175]: arr = np.arange(12.).reshape((3, 4))
  2. In [176]: arr
  3. Out[176]:
  4. array([[ 0., 1., 2., 3.],
  5. [ 4., 5., 6., 7.],
  6. [ 8., 9., 10., 11.]])
  7. In [177]: arr[0]
  8. Out[177]: array([ 0., 1., 2., 3.])
  9. In [178]: arr - arr[0]
  10. Out[178]:
  11. array([[ 0., 0., 0., 0.],
  12. [ 4., 4., 4., 4.],
  13. [ 8., 8., 8., 8.]])


  1. In [179]: frame = pd.DataFrame(np.arange(12.).reshape((4, 3)),
  2. .....: columns=list('bde'),
  3. .....: index=['Utah', 'Ohio', 'Texas', 'Oregon'])
  4. In [180]: series = frame.iloc[0]
  5. In [181]: frame
  6. Out[181]:
  7. b d e
  8. Utah 0.0 1.0 2.0
  9. Ohio 3.0 4.0 5.0
  10. Texas 6.0 7.0 8.0
  11. Oregon 9.0 10.0 11.0
  12. In [182]: series
  13. Out[182]:
  14. b 0.0
  15. d 1.0
  16. e 2.0
  17. Name: Utah, dtype: float64


  1. In [183]: frame - series
  2. Out[183]:
  3. b d e
  4. Utah 0.0 0.0 0.0
  5. Ohio 3.0 3.0 3.0
  6. Texas 6.0 6.0 6.0
  7. Oregon 9.0 9.0 9.0


  1. In [184]: series2 = pd.Series(range(3), index=['b', 'e', 'f'])
  2. In [185]: frame + series2
  3. Out[185]:
  4. b d e f
  5. Utah 0.0 NaN 3.0 NaN
  6. Ohio 3.0 NaN 6.0 NaN
  7. Texas 6.0 NaN 9.0 NaN
  8. Oregon 9.0 NaN 12.0 NaN


  1. In [186]: series3 = frame['d']
  2. In [187]: frame
  3. Out[187]:
  4. b d e
  5. Utah 0.0 1.0 2.0
  6. Ohio 3.0 4.0 5.0
  7. Texas 6.0 7.0 8.0
  8. Oregon 9.0 10.0 11.0
  9. In [188]: series3
  10. Out[188]:
  11. Utah 1.0
  12. Ohio 4.0
  13. Texas 7.0
  14. Oregon 10.0
  15. Name: d, dtype: float64
  16. In [189]: frame.sub(series3, axis='index')
  17. Out[189]:
  18. b d e
  19. Utah -1.0 0.0 1.0
  20. Ohio -1.0 0.0 1.0
  21. Texas -1.0 0.0 1.0
  22. Oregon -1.0 0.0 1.0

传入的轴号就是希望匹配的轴。在本例中,我们的目的是匹配DataFrame的行索引(axis='index'或axis=0)并进行广播。



  1. In [190]: frame = pd.DataFrame(np.random.randn(4, 3), columns=list('bde'),
  2. .....: index=['Utah', 'Ohio', 'Texas', 'Oregon'])
  3. In [191]: frame
  4. Out[191]:
  5. b d e
  6. Utah -0.204708 0.478943 -0.519439
  7. Ohio -0.555730 1.965781 1.393406
  8. Texas 0.092908 0.281746 0.769023
  9. Oregon 1.246435 1.007189 -1.296221
  10. In [192]: np.abs(frame)
  11. Out[192]:
  12. b d e
  13. Utah 0.204708 0.478943 0.519439
  14. Ohio 0.555730 1.965781 1.393406
  15. Texas 0.092908 0.281746 0.769023
  16. Oregon 1.246435 1.007189 1.296221


  1. In [193]: f = lambda x: x.max() - x.min()
  2. In [194]: frame.apply(f)
  3. Out[194]:
  4. b 1.802165
  5. d 1.684034
  6. e 2.689627
  7. dtype: float64



  1. In [195]: frame.apply(f, axis='columns')
  2. Out[195]:
  3. Utah 0.998382
  4. Ohio 2.521511
  5. Texas 0.676115
  6. Oregon 2.542656
  7. dtype: float64



  1. In [196]: def f(x):
  2. .....:     return pd.Series([x.min(), x.max()], index=['min', 'max'])
  3. In [197]: frame.apply(f)
  4. Out[197]:
  5. b d e
  6. min -0.555730 0.281746 -1.296221
  7. max 1.246435 1.965781 1.393406


  1. In [198]: format = lambda x: '%.2f' % x
  2. In [199]: frame.applymap(format)
  3. Out[199]:
  4. b d e
  5. Utah -0.20 0.48 -0.52
  6. Ohio -0.56 1.97 1.39
  7. Texas 0.09 0.28 0.77
  8. Oregon 1.25 1.01 -1.30


  1. In [200]: frame['e'].map(format)
  2. Out[200]:
  3. Utah -0.52
  4. Ohio 1.39
  5. Texas 0.77
  6. Oregon -1.30
  7. Name: e, dtype: object



  1. In [201]: obj = pd.Series(range(4), index=['d', 'a', 'b', 'c'])
  2. In [202]: obj.sort_index()
  3. Out[202]:
  4. a 1
  5. b 2
  6. c 3
  7. d 0
  8. dtype: int64


  1. In [203]: frame = pd.DataFrame(np.arange(8).reshape((2, 4)),
  2. .....: index=['three', 'one'],
  3. .....: columns=['d', 'a', 'b', 'c'])
  4. In [204]: frame.sort_index()
  5. Out[204]:
  6. d a b c
  7. one 4 5 6 7
  8. three 0 1 2 3
  9. In [205]: frame.sort_index(axis=1)
  10. Out[205]:
  11. a b c d
  12. three 1 2 3 0
  13. one 5 6 7 4


  1. In [206]: frame.sort_index(axis=1, ascending=False)
  2. Out[206]:
  3. d c b a
  4. three 0 3 2 1
  5. one 4 7 6 5


  1. In [207]: obj = pd.Series([4, 7, -3, 2])
  2. In [208]: obj.sort_values()
  3. Out[208]:
  4. 2 -3
  5. 3 2
  6. 0 4
  7. 1 7
  8. dtype: int64


  1. In [209]: obj = pd.Series([4, np.nan, 7, np.nan, -3, 2])
  2. In [210]: obj.sort_values()
  3. Out[210]:
  4. 4 -3.0
  5. 5 2.0
  6. 0 4.0
  7. 2 7.0
  8. 1 NaN
  9. 3 NaN
  10. dtype: float64


  1. In [211]: frame = pd.DataFrame({'b': [4, 7, -3, 2], 'a': [0, 1, 0, 1]})
  2. In [212]: frame
  3. Out[212]:
  4. a b
  5. 0 0 4
  6. 1 1 7
  7. 2 0 -3
  8. 3 1 2
  9. In [213]: frame.sort_values(by='b')
  10. Out[213]:
  11. a b
  12. 2 0 -3
  13. 3 1 2
  14. 0 0 4
  15. 1 1 7


  1. In [214]: frame.sort_values(by=['a', 'b'])
  2. Out[214]:
  3. a b
  4. 2 0 -3
  5. 0 0 4
  6. 3 1 2
  7. 1 1 7


  1. In [215]: obj = pd.Series([7, -5, 7, 4, 2, 0, 4])
  2. In [216]: obj.rank()
  3. Out[216]:
  4. 0 6.5
  5. 1 1.0
  6. 2 6.5
  7. 3 4.5
  8. 4 3.0
  9. 5 2.0
  10. 6 4.5
  11. dtype: float64


  1. In [217]: obj.rank(method='first')
  2. Out[217]:
  3. 0 6.0
  4. 1 1.0
  5. 2 7.0
  6. 3 4.0
  7. 4 3.0
  8. 5 2.0
  9. 6 5.0
  10. dtype: float64



  1. # Assign tie values the maximum rank in the group
  2. In [218]: obj.rank(ascending=False, method='max')
  3. Out[218]:
  4. 0 2.0
  5. 1 7.0
  6. 2 2.0
  7. 3 4.0
  8. 4 5.0
  9. 5 6.0
  10. 6 4.0
  11. dtype: float64


  1. In [219]: frame = pd.DataFrame({'b': [4.3, 7, -3, 2], 'a': [0, 1, 0, 1],
  2. .....: 'c': [-2, 5, 8, -2.5]})
  3. In [220]: frame
  4. Out[220]:
  5. a b c
  6. 0 0 4.3 -2.0
  7. 1 1 7.0 5.0
  8. 2 0 -3.0 8.0
  9. 3 1 2.0 -2.5
  10. In [221]: frame.rank(axis='columns')
  11. Out[221]:
  12. a b c
  13. 0 2.0 3.0 1.0
  14. 1 1.0 3.0 2.0
  15. 2 2.0 1.0 3.0
  16. 3 2.0 3.0 1.0

表5-6 排名时用于破坏平级关系的方法



  1. In [222]: obj = pd.Series(range(5), index=['a', 'a', 'b', 'b', 'c'])
  2. In [223]: obj
  3. Out[223]:
  4. a 0
  5. a 1
  6. b 2
  7. b 3
  8. c 4
  9. dtype: int64


  1. In [224]: obj.index.is_unique
  2. Out[224]: False


  1. In [225]: obj['a']
  2. Out[225]:
  3. a 0
  4. a 1
  5. dtype: int64
  6. In [226]: obj['c']
  7. Out[226]: 4



  1. In [227]: df = pd.DataFrame(np.random.randn(4, 3), index=['a', 'a', 'b', 'b'])
  2. In [228]: df
  3. Out[228]:
  4. 0 1 2
  5. a 0.274992 0.228913 1.352917
  6. a 0.886429 -2.001637 -0.371843
  7. b 1.669025 -0.438570 -0.539741
  8. b 0.476985 3.248944 -1.021228
  9. In [229]: df.loc['b']
  10. Out[229]:
  11. 0 1 2
  12. b 1.669025 -0.438570 -0.539741
  13. b 0.476985 3.248944 -1.021228