Replicating ProPublica’s COMPAS Audit#
import numpy as np
import pandas as pd
import scipy
import matplotlib.pyplot as plt
import seaborn as sns
import itertools
from sklearn.metrics import roc_curve
import warnings
warnings.filterwarnings('ignore')
# Location of the cleaned COMPAS CSV used throughout this notebook.
clean_data_url = 'https://raw.githubusercontent.com/ml4sts/outreach-compas/main/data/compas_c.csv'

# Read the CSV first, then promote the `id` column to the row index.
df = pd.read_csv(clean_data_url)
df = df.set_index('id')

# Preview the first rows of the indexed frame.
df.head()
age | c_charge_degree | race | age_cat | score_text | sex | priors_count | days_b_screening_arrest | decile_score | is_recid | two_year_recid | c_jail_in | c_jail_out | length_of_stay | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
id | ||||||||||||||
3 | 34 | F | African-American | 25 - 45 | Low | Male | 0 | -1.0 | 3 | 1 | 1 | 2013-01-26 03:45:27 | 2013-02-05 05:36:53 | 10 |
4 | 24 | F | African-American | Less than 25 | Low | Male | 4 | -1.0 | 4 | 1 | 1 | 2013-04-13 04:58:34 | 2013-04-14 07:02:04 | 1 |
8 | 41 | F | Caucasian | 25 - 45 | Medium | Male | 14 | -1.0 | 6 | 1 | 1 | 2014-02-18 05:08:24 | 2014-02-24 12:18:30 | 6 |
10 | 39 | M | Caucasian | 25 - 45 | Low | Female | 0 | -1.0 | 1 | 0 | 0 | 2014-03-15 05:35:34 | 2014-03-18 04:28:46 | 2 |
14 | 27 | F | Caucasian | 25 - 45 | Low | Male | 0 | -1.0 | 4 | 0 | 0 | 2013-11-25 06:31:06 | 2013-11-26 08:26:57 | 1 |
# Re-read the same CSV without setting an index, for comparison with `df`:
# here `id` shows up as an ordinary column next to a default 0..n row index.
pd.read_csv(clean_data_url).head()
id | age | c_charge_degree | race | age_cat | score_text | sex | priors_count | days_b_screening_arrest | decile_score | is_recid | two_year_recid | c_jail_in | c_jail_out | length_of_stay | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 3 | 34 | F | African-American | 25 - 45 | Low | Male | 0 | -1.0 | 3 | 1 | 1 | 2013-01-26 03:45:27 | 2013-02-05 05:36:53 | 10 |
1 | 4 | 24 | F | African-American | Less than 25 | Low | Male | 4 | -1.0 | 4 | 1 | 1 | 2013-04-13 04:58:34 | 2013-04-14 07:02:04 | 1 |
2 | 8 | 41 | F | Caucasian | 25 - 45 | Medium | Male | 14 | -1.0 | 6 | 1 | 1 | 2014-02-18 05:08:24 | 2014-02-24 12:18:30 | 6 |
3 | 10 | 39 | M | Caucasian | 25 - 45 | Low | Female | 0 | -1.0 | 1 | 0 | 0 | 2014-03-15 05:35:34 | 2014-03-18 04:28:46 | 2 |
4 | 14 | 27 | F | Caucasian | 25 - 45 | Low | Male | 0 | -1.0 | 4 | 0 | 0 | 2013-11-25 06:31:06 | 2013-11-26 08:26:57 | 1 |
# Average decile score over every row in `df`.
df['decile_score'].mean()
4.6227737779461915
# Average decile score computed separately for each value of `race`.
df.groupby('race')['decile_score'].mean()
race
African-American 5.276850
Caucasian 3.635283
Name: decile_score, dtype: float64
# Count how many rows fall into each decile score within each race group,
# then pivot so that every decile score becomes its own column.
race_score_counts = (
    df.groupby('race')['decile_score']
    .value_counts()
    .unstack()
)

# Flip the table (decile scores as rows, races as columns) so the bar chart
# has one cluster per decile score and one bar per race.
race_score_counts.transpose().plot(kind='bar', figsize=(12, 7))
<Axes: xlabel='decile_score'>
![_images/66daf173d12f2ae84211fc61bcd346ca41d937327291c85f91574b4fdb4601b3.png](_images/66daf173d12f2ae84211fc61bcd346ca41d937327291c85f91574b4fdb4601b3.png)
# Show the counts behind the chart: one row per decile score, one column per race.
race_score_counts.T
race | African-American | Caucasian |
---|---|---|
decile_score | ||
1 | 365 | 605 |
2 | 346 | 321 |
3 | 298 | 238 |
4 | 337 | 243 |
5 | 323 | 200 |
6 | 318 | 160 |
7 | 343 | 113 |
8 | 301 | 96 |
9 | 317 | 77 |
10 | 227 | 50 |
# For every distinct number of priors, count the rows belonging to each race
# and spread the races out into columns (combinations that never occur are NaN).
race_priors_counts = (
    df.groupby('priors_count')['race']
    .value_counts()
    .unstack()
)
race_priors_counts
race | African-American | Caucasian |
---|---|---|
priors_count | ||
0 | 846.0 | 821.0 |
1 | 543.0 | 410.0 |
2 | 325.0 | 258.0 |
3 | 247.0 | 170.0 |
4 | 185.0 | 102.0 |
5 | 161.0 | 84.0 |
6 | 118.0 | 55.0 |
7 | 116.0 | 41.0 |
8 | 101.0 | 29.0 |
9 | 80.0 | 38.0 |
10 | 64.0 | 22.0 |
11 | 55.0 | 17.0 |
12 | 43.0 | 8.0 |
13 | 48.0 | 11.0 |
14 | 37.0 | 7.0 |
15 | 36.0 | 5.0 |
16 | 25.0 | 4.0 |
17 | 21.0 | 5.0 |
18 | 15.0 | 2.0 |
19 | 21.0 | 1.0 |
20 | 14.0 | 2.0 |
21 | 17.0 | 1.0 |
22 | 13.0 | 2.0 |
23 | 11.0 | 1.0 |
24 | 9.0 | NaN |
25 | 8.0 | NaN |
26 | 2.0 | 1.0 |
27 | 5.0 | 1.0 |
28 | 3.0 | 2.0 |
29 | 2.0 | NaN |
30 | NaN | 1.0 |
31 | 1.0 | NaN |
33 | 1.0 | 1.0 |
36 | NaN | 1.0 |
37 | 1.0 | NaN |
38 | 1.0 | NaN |
# Bar chart of the priors table: x axis is priors_count, one bar per race column.
race_priors_counts.plot(kind= 'bar',figsize=(12,7))
<Axes: xlabel='priors_count'>
![_images/dcbe14b44549d7816a8d1835df60e37308aa6f1d55a0dcf74db8f255285d2b23.png](_images/dcbe14b44549d7816a8d1835df60e37308aa6f1d55a0dcf74db8f255285d2b23.png)
# Mean of two_year_recid for every (race, decile_score) pair, drawn as bars with
# decile score on the x axis and one bar per race.
# Author's note: if I were COMPAS, I would basically want these numbers to be
# flipped (higher not arrest, lower re-arrest).
(
    df.groupby(['race', 'decile_score'])['two_year_recid']
    .mean()
    .unstack()
    .transpose()
    .plot(kind='bar')
)
<Axes: xlabel='decile_score'>
![_images/ee32bbcb6bd74315677858527b82474b6bafbf4f3e32e110aa8369eb3e71d0b5.png](_images/ee32bbcb6bd74315677858527b82474b6bafbf4f3e32e110aa8369eb3e71d0b5.png)
# Load the `compas_cq.csv` variant of the dataset from the same repository.
dfQ = pd.read_csv('https://raw.githubusercontent.com/ml4sts/outreach-compas/main/data/compas_cq.csv')
# Peek at a single row to see how the columns are encoded in this file.
dfQ.head(1)
id | age | c_charge_degree | race | age_cat | score_text | sex | priors_count | days_b_screening_arrest | decile_score | is_recid | two_year_recid | c_jail_in | c_jail_out | length_of_stay | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 3 | 34 | F | African-American | 25 to 45 | 0 | Male | 0 | -1.0 | 3 | 1 | 1 | 2013-01-26 03:45:27 | 2013-02-05 05:36:53 | <3months |
# Pairwise correlation between the two-year recidivism flag and score_text in dfQ.
dfQ[['two_year_recid','score_text',]].corr()
two_year_recid | score_text | |
---|---|---|
two_year_recid | 1.000000 | 0.314698 |
score_text | 0.314698 | 1.000000 |
# Mean of two_year_recid for each (race, score_text) combination in dfQ.
dfQ.groupby(['race','score_text'])[['two_year_recid']].mean()
two_year_recid | ||
---|---|---|
race | score_text | |
African-American | 0 | 0.351412 |
1 | 0.649535 | |
Caucasian | 0 | 0.289979 |
1 | 0.594828 |
# Long-format table with one row per (race, decile_score) pair and a `count`
# column holding the number of non-null two_year_recid entries in that group.
race_counts_df = (
    df.groupby(['race', 'decile_score'])['two_year_recid']
    .count()
    .reset_index(name='count')
)
race_counts_df
race | decile_score | count | |
---|---|---|---|
0 | African-American | 1 | 365 |
1 | African-American | 2 | 346 |
2 | African-American | 3 | 298 |
3 | African-American | 4 | 337 |
4 | African-American | 5 | 323 |
5 | African-American | 6 | 318 |
6 | African-American | 7 | 343 |
7 | African-American | 8 | 301 |
8 | African-American | 9 | 317 |
9 | African-American | 10 | 227 |
10 | Caucasian | 1 | 605 |
11 | Caucasian | 2 | 321 |
12 | Caucasian | 3 | 238 |
13 | Caucasian | 4 | 243 |
14 | Caucasian | 5 | 200 |
15 | Caucasian | 6 | 160 |
16 | Caucasian | 7 | 113 |
17 | Caucasian | 8 | 96 |
18 | Caucasian | 9 | 77 |
19 | Caucasian | 10 | 50 |
# Configure seaborn's global theme ('poster' context, 'darkgrid' style,
# 'colorblind' palette) for the plots that follow.
sns.set_theme(context='poster', style='darkgrid', palette='colorblind', font='sans-serif', font_scale=1, color_codes=True, rc=None)

# The original call here was the catplot *signature* copied from the docs
# (including the bare `*` separator and `**kwargs`), which is not valid call
# syntax and raised a SyntaxError. Pass only concrete arguments instead:
# the per-race decile-score counts, coloured by race.
sns.catplot(data=race_counts_df, x='decile_score', y='count', hue='race', kind='bar')
# Check the first rows of the long-format counts table before plotting it.
race_counts_df.head()
race | decile_score | count | |
---|---|---|---|
0 | African-American | 1 | 365 |
1 | African-American | 2 | 346 |
2 | African-American | 3 | 298 |
3 | African-American | 4 | 337 |
4 | African-American | 5 | 323 |
# Faceted bar chart of the decile-score counts: one row of the grid per race,
# bars coloured by race, each facet twice as wide as it is tall.
sns.catplot(
    data=race_counts_df,
    kind='bar',
    x='decile_score',
    y='count',
    row='race',
    hue='race',
    aspect=2,
)
<seaborn.axisgrid.FacetGrid at 0x7fc4ae3354a8>
![_images/bcce100389644814a8b7c7d874928f7bb57570030556c2d0b4f5a22b966ac244.png](_images/bcce100389644814a8b7c7d874928f7bb57570030556c2d0b4f5a22b966ac244.png)
# Rebuild the priors-by-race count table inside this cell so the plot does not
# rely on an earlier cell having run in the same kernel session; the recorded
# run of this cell failed with "NameError: name 'race_priors_counts' is not
# defined". NOTE: `df` must still have been loaded before this cell runs.
race_priors_counts = df.groupby('priors_count')['race'].value_counts().unstack()
race_priors_counts.plot(kind= 'bar',figsize=(12,7))