4. 用不等索引填充数值
# 读取三个baseball数据集,行索引设为playerID
In[32]: baseball_14 = pd.read_csv('data/baseball14.csv', index_col='playerID')
baseball_15 = pd.read_csv('data/baseball15.csv', index_col='playerID')
baseball_16 = pd.read_csv('data/baseball16.csv', index_col='playerID')
baseball_14.head()
Out[32]:
# 用索引方法difference,找到哪些索引标签在baseball_14中,却不在baseball_15、baseball_16中
In[33]: baseball_14.index.difference(baseball_15.index)
Out[33]: Index(['corpoca01', 'dominma01', 'fowlede01', 'grossro01', 'guzmaje01',
'hoeslj01', 'krausma01', 'preslal01', 'singljo02'],
dtype='object', name='playerID')
In[34]: baseball_14.index.difference(baseball_16.index)
Out[34]: Index(['congeha01', 'correca01', 'gattiev01', 'gomezca01',
'lowrije01', 'rasmuco01', 'tuckepr01', 'valbulu01'],
dtype='object', name='playerID')
# 找到每名球员在过去三个赛季的击球数,H列包含了这个数据
In[35]: hits_14 = baseball_14['H']
hits_15 = baseball_15['H']
hits_16 = baseball_16['H']
hits_14.head()
Out[35]: Index(['corpoca01', 'dominma01', 'fowlede01', 'grossro01', 'guzmaje01',
'hoeslj01', 'krausma01', 'preslal01', 'singljo02'],
dtype='object', name='playerID')
# 将hits_14和hits_15两列相加
In[36]: (hits_14 + hits_15).head()
Out[36]: playerID
altuvjo01 425.0
cartech02 193.0
castrja01 174.0
congeha01 NaN
corpoca01 NaN
Name: H, dtype: float64
# congeha01 和 corpoca01 在2015年是有记录的,但是结果缺失了。使用add方法和参数fill_value,避免产生缺失值
In[37]: hits_14.add(hits_15, fill_value=0).head()
Out[37]: playerID
altuvjo01 425.0
cartech02 193.0
castrja01 174.0
congeha01 46.0
corpoca01 40.0
Name: H, dtype: float64
# 再将2016的数据也加上
In[38]: hits_total = hits_14.add(hits_15, fill_value=0).add(hits_16, fill_value=0)
hits_total.head()
Out[38]: playerID
altuvjo01 641.0
bregmal01 53.0
cartech02 193.0
castrja01 243.0
congeha01 46.0
Name: H, dtype: float64
# 检查结果中是否有缺失值
In[39]: hits_total.hasnans
Out[39]: False
原理
# 如果一个元素在两个Series都是缺失值,即便使用了fill_value,相加的结果也仍是缺失值
In[40]: s = pd.Series(index=['a', 'b', 'c', 'd'], data=[np.nan, 3, np.nan, 1])
s
Out[40]: a NaN
b 3.0
c NaN
d 1.0
dtype: float64
In[41]: s1 = pd.Series(index=['a', 'b', 'c'], data=[np.nan, 6, 10])
s1
Out[41]: a NaN
b 6.0
c 10.0
dtype: float64
In[42]: s.add(s1, fill_value=5)
Out[42]: a NaN
b 9.0
c 15.0
d 6.0
dtype: float64
In[43]: s1.add(s, fill_value=5)
Out[43]: a NaN
b 9.0
c 15.0
d 6.0
dtype: float64
更多
# 从baseball_14中选取一些列
In[44]: df_14 = baseball_14[['G','AB', 'R', 'H']]
df_14.head()
Out[44]:
# 再从baseball_15中选取一些列,有相同的、也有不同的
In[45]: df_15 = baseball_15[['AB', 'R', 'H', 'HR']]
df_15.head()
Out[45]:
# 将二者相加的话,只要行或列不能对齐,就会产生缺失值。style属性的highlight_null方法可以高亮缺失值
In[46]: (df_14 + df_15).head(10).style.highlight_null('yellow')
Out[46]:
# 即便使用了fill_value=0,有些值也会是缺失值,这是因为一些行和列的组合根本不存在输入的数据中
In[47]: df_14.add(df_15, fill_value=0).head(10).style.highlight_null('yellow')
Out[47]: