7. 用链式方法重现idxmax
# 和前面一样,只选出数值列
In[76]: college = pd.read_csv('data/college.csv', index_col='INSTNM')
cols = ['MD_EARN_WNE_P10', 'GRAD_DEBT_MDN_SUPP']
for col in cols:
college[col] = pd.to_numeric(college[col], errors='coerce')
college_n = college.select_dtypes(include=[np.number])
criteria = college_n.nunique() == 2
binary_cols = college_n.columns[criteria].tolist()
college_n = college_n.drop(labels=binary_cols, axis='columns')
In[77]: college_n.max().head()
Out[77]: SATVRMID 765.0
SATMTMID 785.0
UGDS 151558.0
UGDS_WHITE 1.0
UGDS_BLACK 1.0
dtype: float64
# college_n.max()可以选出每列的最大值,用eq方法比较DataFrame的每个值和该列的最大值
In[78]: college_n.eq(college_n.max()).head()
Out[78]:
# 用any方法,选出至少包含一个True值的行
In[79]: has_row_max = college_n.eq(college_n.max()).any(axis='columns')
has_row_max.head()
Out[79]: INSTNM
Alabama A & M University False
University of Alabama at Birmingham False
Amridge University False
University of Alabama in Huntsville False
Alabama State University False
dtype: bool
# 因为只有18列,has_row_max最多只能有18个True,来看下实际共有多少个
In[80]: college_n.shape
Out[80]: (7535, 18)
In[81]: has_row_max.sum()
Out[81]: 401
# 结果很奇怪,这是因为许多百分比的列的最大值是1。转而使用cumsum()累积求和
In[82]: has_row_max.sum()
In[83]: college_n.eq(college_n.max()).cumsum()
Out[83]:
# 一些列只有一个最大值,比如SATVRMID和SATMTMID,UGDS_WHITE列却有许多最大值。有109所学校的学生100%是白人。如果再使用一次cunsum,1在每列中就只出现一次,而且会是最大值首次出现的位置:
>>> college_n.eq(college_n.max()).cumsum().cumsum()
# 现在就可以用eq方法去和1进行比较,然后用any方法,选出所有至少包含一个True值的行
In[84]: has_row_max2 = college_n.eq(college_n.max())\
.cumsum()\
.cumsum()\
.eq(1)\
.any(axis='columns')
has_row_max2.head()
Out[84]: INSTNM
Alabama A & M University False
University of Alabama at Birmingham False
Amridge University False
University of Alabama in Huntsville False
Alabama State University False
dtype: bool
# 查看有多少True值
In[85]: has_row_max2.sum()
Out[85]: 16
# 直接通过布尔索引选出这些学校
In[86]: idxmax_cols = has_row_max2[has_row_max2].index
idxmax_cols
Out[86]: Index(['Thunderbird School of Global Management',
'Southwest University of Visual Arts-Tucson', 'ABC Beauty College Inc',
'Velvatex College of Beauty Culture',
'California Institute of Technology',
'Le Cordon Bleu College of Culinary Arts-San Francisco',
'MTI Business College Inc', 'Dongguk University-Los Angeles',
'Mr Leon's School of Hair Design-Moscow',
'Haskell Indian Nations University', 'LIU Brentwood',
'Medical College of Wisconsin', 'Palau Community College',
'California University of Management and Sciences',
'Cosmopolitan Beauty and Tech School', 'University of Phoenix-Arizona'],
dtype='object', name='INSTNM')
# 和idxmax方法的结果比较
In[87]: set(college_n.idxmax().unique()) == set(idxmax_cols)
Out[87]: True
更多
# 耗时比较
In[88]: %timeit college_n.idxmax().values
1.11 ms ± 50.9 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)
Out[89]: %timeit college_n.eq(college_n.max())\
.cumsum()\
.cumsum()\
.eq(1)\
.any(axis='columns')\
[lambda x: x].index
5.26 ms ± 35.6 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)