6. 高亮每列的最大值
In[61]: pd.options.display.max_rows = 8
# 读取college数据集,将INSTNM列作为行索引
In[62]: college = pd.read_csv('data/college.csv', index_col='INSTNM')
college.dtypes
Out[62]: CITY object
STABBR object
HBCU float64
MENONLY float64
...
PCTFLOAN float64
UG25ABV float64
MD_EARN_WNE_P10 object
GRAD_DEBT_MDN_SUPP object
Length: 26, dtype: object
# MD_EARN_WNE_P10 和 GRAD_DEBT_MDN_SUPP 两列是object类型,检查它们的第一个值,发现数字是以字符串形式存储的
In[63]: college.MD_EARN_WNE_P10.iloc[0]
Out[63]: '30300'
In[64]: college.GRAD_DEBT_MDN_SUPP.iloc[0]
Out[64]: '33888'
# 按降序排序,查看是哪些非数值字符串导致该列成为object类型
In[65]: college.MD_EARN_WNE_P10.sort_values(ascending=False).head()
Out[65]: INSTNM
Sharon Regional Health System School of Nursing PrivacySuppressed
Northcoast Medical Training Academy PrivacySuppressed
Success Schools PrivacySuppressed
Louisiana Culinary Institute PrivacySuppressed
Bais Medrash Toras Chesed PrivacySuppressed
Name: MD_EARN_WNE_P10, dtype: object
# 可以用to_numeric将这些列强制转换为数值类型;errors='coerce'会把无法转换的字符串(如PrivacySuppressed)变为NaN
In[66]: cols = ['MD_EARN_WNE_P10', 'GRAD_DEBT_MDN_SUPP']
for col in cols:
college[col] = pd.to_numeric(college[col], errors='coerce')
college.dtypes.loc[cols]
Out[66]: MD_EARN_WNE_P10 float64
GRAD_DEBT_MDN_SUPP float64
dtype: object
# 用select_dtypes方法过滤出数值列
In[67]: college_n = college.select_dtypes(include=[np.number])
college_n.head()
Out[67]:
# 有的列只含有两个值,用nunique()方法挑出这些列
In[68]: criteria = college_n.nunique() == 2
criteria.head()
Out[68]: HBCU True
MENONLY True
WOMENONLY True
RELAFFIL True
SATVRMID False
dtype: bool
# 将布尔Series传给索引运算符,生成二元列的列表
In[69]: binary_cols = college_n.columns[criteria].tolist()
binary_cols
Out[69]: ['HBCU', 'MENONLY', 'WOMENONLY', 'RELAFFIL', 'DISTANCEONLY', 'CURROPER']
# 用drop方法删除这些列
In[70]: college_n2 = college_n.drop(labels=binary_cols, axis='columns')
college_n2.head()
Out[70]:
# 用idxmax方法选出每列最大值的行索引标签
In[71]: max_cols = college_n2.idxmax()
max_cols
Out[71]: SATVRMID California Institute of Technology
SATMTMID California Institute of Technology
UGDS University of Phoenix-Arizona
UGDS_WHITE Mr Leon's School of Hair Design-Moscow
...
PCTFLOAN ABC Beauty College Inc
UG25ABV Dongguk University-Los Angeles
MD_EARN_WNE_P10 Medical College of Wisconsin
GRAD_DEBT_MDN_SUPP Southwest University of Visual Arts-Tucson
Length: 18, dtype: object
# 用unique()方法选出所有不重复的行索引标签(即院校名)
In[72]: unique_max_cols = max_cols.unique()
unique_max_cols[:5]
Out[72]: array(['California Institute of Technology',
'University of Phoenix-Arizona',
"Mr Leon's School of Hair Design-Moscow",
'Velvatex College of Beauty Culture',
'Thunderbird School of Global Management'], dtype=object)
# 用unique_max_cols选出至少含有一个列最大值的行,再用style的highlight_max()高亮每列的最大值
In[73]: college_n2.loc[unique_max_cols].style.highlight_max()
Out[73]:
更多
# 用axis参数可以高亮每行的最大值
In[74]: college = pd.read_csv('data/college.csv', index_col='INSTNM')
college_ugds = college.filter(like='UGDS_').head()
college_ugds.style.highlight_max(axis='columns')
Out[74]:
In[75]: pd.Timedelta(1, unit='Y')
Out[75]: Timedelta('365 days 05:49:12')