6. 按工作日和年测量犯罪
# 读取crime数据集,将REPORTED_DATE作为一列
In[105]: crime = pd.read_hdf('data/crime.h5', 'crime')
crime.head()
Out[105]:
# 可以通过Series的dt访问器得到每条记录是星期几,然后统计各星期的数量(注意:weekday_name在pandas 1.0中已被移除,新版本请改用dt.day_name())
In[106]: wd_counts = crime['REPORTED_DATE'].dt.weekday_name.value_counts()
wd_counts
Out[106]: Monday 70024
Friday 69621
Wednesday 69538
Thursday 69287
Tuesday 68394
Saturday 58834
Sunday 55213
Name: REPORTED_DATE, dtype: int64
# 画一张水平柱状图
In[107]: days = ['Monday', 'Tuesday', 'Wednesday', 'Thursday',
'Friday', 'Saturday', 'Sunday']
title = 'Denver Crimes and Traffic Accidents per Weekday'
wd_counts.reindex(days).plot(kind='barh', title=title)
Out[107]: <matplotlib.axes._subplots.AxesSubplot at 0x117e39e48>
# 相似的,也可以画出每年的水平柱状图
In[108]: title = 'Denver Crimes and Traffic Accidents per Year'
crime['REPORTED_DATE'].dt.year.value_counts() \
.sort_index() \
.plot(kind='barh', title=title)
Out[108]: <matplotlib.axes._subplots.AxesSubplot at 0x11b1c6d68>
# 将年和星期按两列分组聚合
In[109]: weekday = crime['REPORTED_DATE'].dt.weekday_name
year = crime['REPORTED_DATE'].dt.year
crime_wd_y = crime.groupby([year, weekday]).size()
crime_wd_y.head(10)
Out[109]: REPORTED_DATE REPORTED_DATE
2012 Friday 8549
Monday 8786
Saturday 7442
Sunday 7189
Thursday 8440
Tuesday 8191
Wednesday 8440
2013 Friday 10380
Monday 10627
Saturday 8875
dtype: int64
# 重命名索引名,然后对Weekday做unstack
In[110]: crime_table = crime_wd_y.rename_axis(['Year', 'Weekday']).unstack('Weekday')
crime_table
Out[110]:
# 找到数据中2017年的最后一天
In[111]: criteria = crime['REPORTED_DATE'].dt.year == 2017
crime.loc[criteria, 'REPORTED_DATE'].dt.dayofyear.max()
Out[111]: 272
# 计算这272天占全年的比例(2017年的数据只覆盖了全年的一部分)
In[112]: round(272 / 365, 3)
Out[112]: 0.745
In[113]: crime_pct = crime['REPORTED_DATE'].dt.dayofyear.le(272) \
.groupby(year) \
.mean() \
.round(3)
crime_pct
Out[113]: REPORTED_DATE
2012 0.748
2013 0.725
2014 0.751
2015 0.748
2016 0.752
2017 1.000
Name: REPORTED_DATE, dtype: float64
In[114]: crime_pct.loc[2012:2016].median()
Out[114]: 0.748
# 更新2017年的数据,并将星期排序
In[115]: crime_table.loc[2017] = crime_table.loc[2017].div(.748).astype('int')
crime_table = crime_table.reindex(columns=days)
crime_table
Out[115]:
# 用seaborn画热力图
In[116]: import seaborn as sns
sns.heatmap(crime_table, cmap='Greys')
Out[116]: <matplotlib.axes._subplots.AxesSubplot at 0x117a37ba8>
# 犯罪貌似每年都在增加,但这个数据没有考虑每年的新增人口。
# 读取丹佛市人口denver_pop数据集
In[117]: denver_pop = pd.read_csv('data/denver_pop.csv', index_col='Year')
denver_pop
Out[117]:
# 计算每10万人的犯罪率
In[118]: den_100k = denver_pop.div(100000).squeeze()
crime_table2 = crime_table.div(den_100k, axis='index').astype('int')
crime_table2
Out[118]:
# 再画一张热力图
In[119]: sns.heatmap(crime_table2, cmap='Greys')
Out[119]: <matplotlib.axes._subplots.AxesSubplot at 0x1203024e0>
原理
# loc接收一个排好序的列表,也可以实现reindex同样的功能
In[120]: wd_counts.loc[days]
Out[120]: Monday 70024
Tuesday 68394
Wednesday 69538
Thursday 69287
Friday 69621
Saturday 58834
Sunday 55213
Name: REPORTED_DATE, dtype: int64
# DataFrame和Series相除,会使用DataFrame的列和Series的行索引对齐
In[121]: crime_table / den_100k
/Users/Ted/anaconda/lib/python3.6/site-packages/pandas/core/indexes/base.py:3033: RuntimeWarning: '<' not supported between instances of 'str' and 'int', sort order is undefined for incomparable objects
return this.join(other, how=how, return_indexers=return_indexers)
Out[121]:
更多
# 将之前的操作打包成一个函数,并且可以根据犯罪类型筛选数据
In[122]: ADJ_2017 = .748
def count_crime(df, offense_cat):
    """Tabulate and plot per-100k crime counts by year and weekday.

    Rows of ``df`` whose ``OFFENSE_CATEGORY_ID`` equals ``offense_cat`` are
    counted per (year, weekday) of ``REPORTED_DATE``. The 2017 row is scaled
    up by the module-level ``ADJ_2017`` factor, every row is divided by that
    year's population (read from ``data/denver_pop.csv``) expressed in units
    of 100,000, and the columns are ordered by the module-level ``days``
    list. The resulting table is drawn as a seaborn heatmap and returned.

    Parameters
    ----------
    df : DataFrame
        Must contain an ``OFFENSE_CATEGORY_ID`` column and a datetime
        ``REPORTED_DATE`` column.
    offense_cat : str
        Offense category to keep, e.g. ``'auto-theft'``.

    Returns
    -------
    DataFrame
        Index is the year, columns are the weekday names in ``days`` order.
    """
    df = df[df['OFFENSE_CATEGORY_ID'] == offense_cat]
    reported = df['REPORTED_DATE']

    # ``dt.weekday_name`` was removed in pandas 1.0; ``dt.day_name()`` is the
    # replacement and returns the same English weekday names.
    weekday = reported.dt.day_name()
    year = reported.dt.year

    # fill_value=0 keeps the table integer-valued when a (year, weekday)
    # pair has no rows; a NaN there would make the astype('int') calls fail.
    ct = df.groupby([year, weekday]).size().unstack(fill_value=0)

    # Scale the 2017 counts by ADJ_2017. Skipped when the selected category
    # has no 2017 rows, which would otherwise raise KeyError.
    if 2017 in ct.index:
        ct.loc[2017] = ct.loc[2017].div(ADJ_2017).astype('int')

    pop = pd.read_csv('data/denver_pop.csv', index_col='Year')
    pop = pop.squeeze().div(100000)

    # Restrict the population to the years present in the table so that
    # extra years in the CSV do not introduce all-NaN rows on alignment.
    ct = ct.div(pop.reindex(ct.index), axis=0).astype('int')

    ct = ct.reindex(columns=days)
    sns.heatmap(ct, cmap='Greys')
    return ct
In[123]: count_crime(crime, 'auto-theft')
Out[123]: