Portfolio Jensen Hu

Modeling the Occurrence of Stroke - Binary Classification with Python's Scikit Learn

# packages
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
---------------------------------------------------------------------------

ModuleNotFoundError                       Traceback (most recent call last)

<ipython-input-10-6d7c1745ed9f> in <module>
      4 import matplotlib.pyplot as plt
      5 import seaborn as sns
----> 6 import missingno as msno # missing data


ModuleNotFoundError: No module named 'missingno'

Dataset: Stroke Prediction Data | Date: 2/6/2022 | Shape: 5110 rows, 12 columns

# read stroke data from a CSV file addressed by a relative path,
# then preview the first rows
stroke = pd.read_csv("healthcare-dataset-stroke-data.csv")
stroke.head()
id gender age hypertension heart_disease ever_married work_type Residence_type avg_glucose_level bmi smoking_status stroke
0 9046 Male 67.0 0 1 Yes Private Urban 228.69 36.6 formerly smoked 1
1 51676 Female 61.0 0 0 Yes Self-employed Rural 202.21 NaN never smoked 1
2 31112 Male 80.0 0 1 Yes Private Rural 105.92 32.5 never smoked 1
3 60182 Female 49.0 0 0 Yes Private Urban 171.23 34.4 smokes 1
4 1665 Female 79.0 1 0 Yes Self-employed Rural 174.12 24.0 never smoked 1
# (rows, columns) of the loaded data
stroke.shape
(5110, 12)
# per-column dtype and non-null count, used to spot missing values
stroke.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5110 entries, 0 to 5109
Data columns (total 12 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   id                 5110 non-null   int64  
 1   gender             5110 non-null   object 
 2   age                5110 non-null   float64
 3   hypertension       5110 non-null   int64  
 4   heart_disease      5110 non-null   int64  
 5   ever_married       5110 non-null   object 
 6   work_type          5110 non-null   object 
 7   Residence_type     5110 non-null   object 
 8   avg_glucose_level  5110 non-null   float64
 9   bmi                4909 non-null   float64
 10  smoking_status     5110 non-null   object 
 11  stroke             5110 non-null   int64  
dtypes: float64(3), int64(4), object(5)
memory usage: 479.2+ KB

Notes:

  • id - read as numeric, but it is an identifier and should be stored as a string (object)
  • Hypertension and Heart Disease are 0/1 flags and should be treated as categorical variables
  • Work type, Residence type, and Smoking status are categorical
  • bmi has missing values, which need to be imputed or removed
# convert variables: cast the numeric id column to str so it is no longer
# handled as a number, then re-check the dtypes
stroke['id'] = stroke['id'].astype(str)
stroke.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5110 entries, 0 to 5109
Data columns (total 12 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   id                 5110 non-null   object 
 1   gender             5110 non-null   object 
 2   age                5110 non-null   float64
 3   hypertension       5110 non-null   int64  
 4   heart_disease      5110 non-null   int64  
 5   ever_married       5110 non-null   object 
 6   work_type          5110 non-null   object 
 7   Residence_type     5110 non-null   object 
 8   avg_glucose_level  5110 non-null   float64
 9   bmi                4909 non-null   float64
 10  smoking_status     5110 non-null   object 
 11  stroke             5110 non-null   int64  
dtypes: float64(3), int64(3), object(6)
memory usage: 479.2+ KB
# missing data: draw the boolean null-mask of the frame as a heatmap
# (colour bar switched off) to see which columns contain missing values
sns.heatmap(stroke.isnull(), cbar=False)
<AxesSubplot:>

png

# Review missing data (BMI): add a 1.0 / 0.0 indicator for rows whose bmi is
# missing. isna() already yields the boolean mask, so a single cast replaces
# the two .loc assignments and avoids unary minus on a boolean Series.
stroke['bmi_missing'] = stroke['bmi'].isna().astype(float)

# Check whether missingness is correlated with the other numeric variables.
# Numeric columns are selected explicitly: the string columns cannot be
# correlated, and pandas >= 2.0 raises on them instead of dropping them.
corr_matrix = stroke.select_dtypes(include='number').corr()
corr_matrix['bmi_missing'].sort_values(ascending = False)
bmi_missing          1.000000
stroke               0.141238
heart_disease        0.098621
hypertension         0.093046
avg_glucose_level    0.091957
age                  0.078956
bmi                       NaN
Name: bmi_missing, dtype: float64

# histograms of the frame's columns, drawn with figsize (12, 10)
stroke.hist(figsize = (12, 10))
plt.show()

png

# inspect the column names, then print the frequency of every level for
# each of the categorical variables listed in `cat`
stroke.columns
cat = ['gender', 'hypertension', 'heart_disease', 'ever_married',
       'work_type', 'Residence_type', 'smoking_status']
for c in cat:
    print(stroke[c].value_counts())
# train test split our dataset: hold out 20% of the rows as the test set;
# random_state is fixed so the same split is produced on every run
from sklearn.model_selection import train_test_split
train_set, test_set = train_test_split(stroke, test_size = 0.2, random_state = 42)
# NOTE(review): no `stratify` argument is passed, so the share of stroke
# cases may differ between the two sets - confirm this is intended
train_set.shape
test_set.shape
# explore data with a copy of the train set, so the exploratory type
# conversions below do not modify train_set itself
explore = train_set.copy()
# Check out correlations with the target. Numeric columns are selected
# explicitly: the string columns cannot be correlated, and pandas >= 2.0
# raises on them instead of dropping them.
corr_matrix = explore.select_dtypes(include='number').corr()
corr_matrix['stroke'].sort_values(ascending = False)
# Recast the 0/1 indicator columns (and the target) as strings so they are
# handled as categories rather than numbers in the exploration that follows.
explore['hypertension'] = explore['hypertension'].astype(str)
explore['heart_disease'] = explore['heart_disease'].astype(str)
explore['stroke'] = explore['stroke'].astype(str)
explore.info()

from sklearn.impute import SimpleImputer
# Median imputation for the continuous features.
imputer = SimpleImputer(strategy = "median")
# Fit on the continuous predictors. bmi is the only column with missing
# values, so it must be part of the fit; stroke is the target (and a string
# column in `explore`), so it is not imputed.
explore_num = explore[['age', 'avg_glucose_level', 'bmi']]
imputer.fit(explore_num)
# Learned per-column medians, in the column order of explore_num.
imputer.statistics_