1. 规划数据分析路线
# Load the college dataset from CSV, then preview its first rows
In[2]: college = pd.read_csv('data/college.csv')
In[3]: college.head()
Out[3]:
# Number of rows and columns in the DataFrame
In[4]: college.shape
Out[4]: (7535, 27)
# 统计数值列,并进行转置
In[5]: with pd.option_context('display.max_rows', 8):
display(college.describe(include=[np.number]).T)
Out[5]:
# Summarize the object (string) and categorical columns, transposed.
# The builtin `object` replaces the removed `np.object` alias, and the
# 'category' string is the documented selector for categorical dtypes.
In[6]: college.describe(include=[object, 'category']).T
Out[6]:
# Show each column's dtype and non-null count, plus the frame's memory usage
In[7]: college.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7535 entries, 0 to 7534
Data columns (total 27 columns):
INSTNM 7535 non-null object
CITY 7535 non-null object
STABBR 7535 non-null object
HBCU 7164 non-null float64
MENONLY 7164 non-null float64
WOMENONLY 7164 non-null float64
RELAFFIL 7535 non-null int64
SATVRMID 1185 non-null float64
SATMTMID 1196 non-null float64
DISTANCEONLY 7164 non-null float64
UGDS 6874 non-null float64
UGDS_WHITE 6874 non-null float64
UGDS_BLACK 6874 non-null float64
UGDS_HISP 6874 non-null float64
UGDS_ASIAN 6874 non-null float64
UGDS_AIAN 6874 non-null float64
UGDS_NHPI 6874 non-null float64
UGDS_2MOR 6874 non-null float64
UGDS_NRA 6874 non-null float64
UGDS_UNKN 6874 non-null float64
PPTUG_EF 6853 non-null float64
CURROPER 7535 non-null int64
PCTPELL 6849 non-null float64
PCTFLOAN 6849 non-null float64
UG25ABV 6718 non-null float64
MD_EARN_WNE_P10 6413 non-null object
GRAD_DEBT_MDN_SUPP 7503 non-null object
dtypes: float64(20), int64(2), object(5)
memory usage: 1.6+ MB
# Repeat of the numeric summary, this time without capping the displayed rows
In[8]: college.describe(include=[np.number]).T
Out[8]:
# Repeats the earlier summary of the object and categorical columns.
# The builtin `object` replaces the removed `np.object` alias, and the
# 'category' string is the documented selector for categorical dtypes.
In[9]: college.describe(include=[object, 'category']).T
Out[9]:
更多
# 在describe方法中,打印分位数
In[10]: with pd.option_context('display.max_rows', 5):
display(college.describe(include=[np.number],
percentiles=[.01, .05, .10, .25, .5, .75, .9, .95, .99]).T)
# 展示一个数据字典:数据字典的主要作用是解释列名的意义
In[11]: college_dd = pd.read_csv('data/college_data_dictionary.csv')
In[12]: with pd.option_context('display.max_rows', 8):
display(college_dd)