Modeling the Occurrence of Stroke - Binary Classification with Python's Scikit Learn
09 Feb 2022
# packages
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
---------------------------------------------------------------------------
ModuleNotFoundError Traceback (most recent call last)
<ipython-input-10-6d7c1745ed9f> in <module>
4 import matplotlib.pyplot as plt
5 import seaborn as sns
----> 6 import missingno as msno # missing data
ModuleNotFoundError: No module named 'missingno'
Dataset: Stroke Prediction Data | Date: 2/6/2022 | Shape: 5110 rows, 12 columns
# load the stroke dataset from the CSV in the working directory
stroke = pd.read_csv(
    "healthcare-dataset-stroke-data.csv",
)
# preview the first five rows
stroke.head()
id | gender | age | hypertension | heart_disease | ever_married | work_type | Residence_type | avg_glucose_level | bmi | smoking_status | stroke | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 9046 | Male | 67.0 | 0 | 1 | Yes | Private | Urban | 228.69 | 36.6 | formerly smoked | 1 |
1 | 51676 | Female | 61.0 | 0 | 0 | Yes | Self-employed | Rural | 202.21 | NaN | never smoked | 1 |
2 | 31112 | Male | 80.0 | 0 | 1 | Yes | Private | Rural | 105.92 | 32.5 | never smoked | 1 |
3 | 60182 | Female | 49.0 | 0 | 0 | Yes | Private | Urban | 171.23 | 34.4 | smokes | 1 |
4 | 1665 | Female | 79.0 | 1 | 0 | Yes | Self-employed | Rural | 174.12 | 24.0 | never smoked | 1 |
stroke.shape
(5110, 12)
stroke.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5110 entries, 0 to 5109
Data columns (total 12 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 id 5110 non-null int64
1 gender 5110 non-null object
2 age 5110 non-null float64
3 hypertension 5110 non-null int64
4 heart_disease 5110 non-null int64
5 ever_married 5110 non-null object
6 work_type 5110 non-null object
7 Residence_type 5110 non-null object
8 avg_glucose_level 5110 non-null float64
9 bmi 4909 non-null float64
10 smoking_status 5110 non-null object
11 stroke 5110 non-null int64
dtypes: float64(3), int64(4), object(5)
memory usage: 479.2+ KB
Notes:
- id - shown as numeric, but it is an identifier, so it should be stored as a string (object)
- Hypertension and Heart Disease should be character vars
- Work type, Residence, Smoking_status are categorical
- bmi has missing values, needs to be imputed or removed
# convert variables: id is an identifier, so keep it as a string rather than
# a number
id_as_text = stroke['id'].astype(str)
stroke['id'] = id_as_text
# re-check the dtypes after the conversion
stroke.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5110 entries, 0 to 5109
Data columns (total 12 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 id 5110 non-null object
1 gender 5110 non-null object
2 age 5110 non-null float64
3 hypertension 5110 non-null int64
4 heart_disease 5110 non-null int64
5 ever_married 5110 non-null object
6 work_type 5110 non-null object
7 Residence_type 5110 non-null object
8 avg_glucose_level 5110 non-null float64
9 bmi 4909 non-null float64
10 smoking_status 5110 non-null object
11 stroke 5110 non-null int64
dtypes: float64(3), int64(3), object(6)
memory usage: 479.2+ KB
# missing data: draw the null mask as a heatmap (one row per record, one
# column per variable), without the colour bar
null_mask = stroke.isnull()
sns.heatmap(null_mask, cbar=False)
<AxesSubplot:>
# review missing data (BMI)
# Flag rows with a missing BMI: 1.0 where bmi is NaN, 0.0 otherwise.
# isna() already yields the boolean mask, so a single cast replaces the two
# masked assignments; it also avoids negating the mask with unary minus
# (`~` is the boolean-inversion operator for a mask).
stroke['bmi_missing'] = stroke['bmi'].isna().astype(float)
# check to see if missing data is correlated
# Restrict to numeric columns before correlating: the frame still holds
# string columns (id, gender, ever_married, work_type, ...), and
# pandas >= 2.0 raises on them in corr() instead of silently dropping them.
corr_matrix = stroke.select_dtypes(include='number').corr()
# correlation of every numeric column with the missing-BMI flag, strongest first
corr_matrix['bmi_missing'].sort_values(ascending = False)
bmi_missing 1.000000
stroke 0.141238
heart_disease 0.098621
hypertension 0.093046
avg_glucose_level 0.091957
age 0.078956
bmi NaN
Name: bmi_missing, dtype: float64
# one histogram per numeric column, drawn on a 12 x 10 inch figure
fig_size = (12, 10)
stroke.hist(figsize=fig_size)
plt.show()
stroke.columns
# categorical (and 0/1-coded) columns to tabulate
cat = ['gender', 'hypertension', 'heart_disease', 'ever_married',
       'work_type', 'Residence_type', 'smoking_status']
# print the frequency of each level for every column listed above
for c in cat:
    print(stroke[c].value_counts())
# train test split our dataset: hold out 20% of the rows as the test set,
# with a fixed seed so the split is reproducible
from sklearn.model_selection import train_test_split
train_set, test_set = train_test_split(
    stroke,
    test_size=0.2,
    random_state=42,
)
train_set.shape
test_set.shape
# explore data with a copy of the train set, so the exploration below does
# not modify train_set itself
explore = train_set.copy()
# check out correlations
# Only numeric columns can be correlated; selecting them explicitly keeps
# this working on pandas >= 2.0, where corr() raises on string columns
# instead of silently dropping them.
corr_matrix = explore.select_dtypes(include='number').corr()
# correlation of every numeric column with the target, strongest first
corr_matrix['stroke'].sort_values(ascending = False)
# treat the 0/1-coded flags and the target as categories by casting them to str
for flag_col in ('hypertension', 'heart_disease', 'stroke'):
    explore[flag_col] = explore[flag_col].astype(str)
# confirm the new dtypes
explore.info()
from sklearn.impute import SimpleImputer
# fill missing values with each column's median
imputer = SimpleImputer(strategy = "median")
# Fit on the continuous numeric features. bmi is the column that actually has
# missing values (4909 of 5110 non-null in stroke.info()). 'stroke' is the
# target and was cast to str above, which the median strategy cannot handle,
# so it does not belong here.
explore_num = explore[['age', 'avg_glucose_level', 'bmi']]
imputer.fit(explore_num)
# learned per-column medians, in the column order of explore_num
imputer.statistics_