1. DataFrame添加新的行
# 读取names数据集
In[2]: names = pd.read_csv('data/names.csv')
names
Out[2]:
# 用loc直接赋值新的行
In[3]: new_data_list = ['Aria', 1]
names.loc[4] = new_data_list
names
Out[3]:
# 用loc和字符串标签直接赋值新的行
In[4]: names.loc['five'] = ['Zach', 3]
names
Out[4]:
# 也可以用字典赋值新行
In[5]: names.loc[len(names)] = {'Name':'Zayd', 'Age':2}
names
Out[5]:
In[6]: names
Out[6]:
# 也可以用Series赋值,其键的顺序不必与列名的顺序一致
In[7]: names.loc[len(names)] = pd.Series({'Age':32, 'Name':'Dean'})
names
Out[7]:
# 直接append一个字典会报错(TypeError)。注意:DataFrame.append在pandas 1.4中已弃用,并在2.0中被移除,新版本请改用pd.concat
In[8]: names = pd.read_csv('data/names.csv')
names.append({'Name':'Aria', 'Age':1})
---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
<ipython-input-8-562aecc73587> in <module>()
1 # Use append with fresh copy of names
2 names = pd.read_csv('data/names.csv')
----> 3 names.append({'Name':'Aria', 'Age':1})
/Users/Ted/anaconda/lib/python3.6/site-packages/pandas/core/frame.py in append(self, other, ignore_index, verify_integrity)
4515 other = Series(other)
4516 if other.name is None and not ignore_index:
-> 4517 raise TypeError('Can only append a Series if ignore_index=True'
4518 ' or if the Series has a name')
4519
TypeError: Can only append a Series if ignore_index=True or if the Series has a name
# 按照错误提示,加上ignore_index=True
In[9]: names.append({'Name':'Aria', 'Age':1}, ignore_index=True)
Out[9]:
# 设定行索引
In[10]: names.index = ['Canada', 'Canada', 'USA', 'USA']
names
Out[10]:
# 再添加一行;由于ignore_index=True,原有的行索引会被丢弃,结果的索引变为从0开始的整数
In[11]: names.append({'Name':'Aria', 'Age':1}, ignore_index=True)
Out[11]:
# 创建一个带name的Series对象,name将作为新行的索引标签
In[12]: s = pd.Series({'Name': 'Zach', 'Age': 3}, name=len(names))
s
Out[12]: Age 3
Name Zach
Name: 4, dtype: object
# append方法可以将DataFrame和Series相连
In[13]: names.append(s)
Out[13]:
# append方法可以同时连接多行,只要将对象放到列表中
In[14]: s1 = pd.Series({'Name': 'Zach', 'Age': 3}, name=len(names))
s2 = pd.Series({'Name': 'Zayd', 'Age': 2}, name='USA')
names.append([s1, s2])
Out[14]:
# 读取baseball16数据集
In[15]: bball_16 = pd.read_csv('data/baseball16.csv')
bball_16.head()
Out[15]:
# 选取一行,并将其转换为字典
In[16]: data_dict = bball_16.iloc[0].to_dict()
print(data_dict)
{'playerID': 'altuvjo01', 'yearID': 2016, 'stint': 1, 'teamID': 'HOU', 'lgID': 'AL', 'G': 161, 'AB': 640, 'R': 108, 'H': 216, '2B': 42, '3B': 5, 'HR': 24, 'RBI': 96.0, 'SB': 30.0, 'CS': 10.0, 'BB': 60, 'SO': 70.0, 'IBB': 11.0, 'HBP': 7.0, 'SH': 3.0, 'SF': 7.0, 'GIDP': 15.0}
# 把字典的值清空:原值是字符串的设为空字符串,其余的设为缺失值(np.nan)
In[17]: new_data_dict = {
    key: ('' if isinstance(value, str) else np.nan)
    for key, value in data_dict.items()
}
print(new_data_dict)
{'playerID': '', 'yearID': nan, 'stint': nan, 'teamID': '', 'lgID': '', 'G': nan, 'AB': nan, 'R': nan, 'H': nan, '2B': nan, '3B': nan, 'HR': nan, 'RBI': nan, 'SB': nan, 'CS': nan, 'BB': nan, 'SO': nan, 'IBB': nan, 'HBP': nan, 'SH': nan, 'SF': nan, 'GIDP': nan}
更多
# 每次向DataFrame添加一行都非常消耗资源,因此不应在循环中逐行添加。下面创建一千行新数据,保存为一个由Series组成的列表:
In[18]: random_data = []
# Build 1000 synthetic rows, one Series per row; each Series is named
# i + len(bball_16) so its label continues after the existing rows.
for i in range(1000):
    d = dict()
    for k, v in data_dict.items():
        # Mirror the sample row's types: a random letter where the sample
        # value is a string, a random integer in [0, 10) otherwise.
        if isinstance(v, str):
            d[k] = np.random.choice(list('abcde'))
        else:
            d[k] = np.random.randint(10)
    random_data.append(pd.Series(d, name=i + len(bball_16)))
random_data[0].head()
Out[18]: 2B 2
3B 6
AB 8
BB 2
CS 0
Name: 16, dtype: object
# 给逐行append的循环计时:添加1000行数据用了约5秒钟
In[19]: %%timeit
bball_16_copy = bball_16.copy()
for row in random_data:
bball_16_copy = bball_16_copy.append(row)
5.36 s ± 298 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
# 如果把整个Series列表一次性传给append,可以大大节省时间
In[20]: %%timeit
# The whole list of Series in random_data is passed to a single append
# call instead of appending one row at a time.
bball_16_copy = bball_16.copy()
bball_16_copy = bball_16_copy.append(random_data)
86.2 ms ± 3.71 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)