Introduction
In this week’s discussion section, we will take a dataset with few NAs and intentionally add more NAs to it. We are going to run different imputation strategies on our newly “NA-ed” dataset and see which performs best. Normally, you never get to know how well your imputation actually performed, but this exercise will allow us to look under the hood a bit at how different imputation strategies behave. Once we find which imputation strategy works best, we will run a random forest on both the original data and our newly imputed data. Which do you think will perform better?
Data
This week, we will be working with mushroom data! This dataset from the UCI Machine Learning Repository includes descriptions of hypothetical samples corresponding to 23 species of gilled mushrooms in the Agaricus and Lepiota families. Our target variable will be poisonous, a categorical outcome variable classifying the mushroom as poisonous or not. We will include 22 features in our dataset that all relate to mushroom characteristics, such as cap-shape, cap-surface, bruises, and odor.
Exercise
Import Libraries
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, BaggingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.metrics import mean_squared_error, accuracy_score, r2_score
from ucimlrepo import fetch_ucirepo
Load Data
# Fetch dataset
mushroom = fetch_ucirepo(id = 73)
# Save data as X and y variables
X = mushroom.data.features
y = np.ravel(mushroom.data.targets)
# Expand dataframe columns and view dataframe
pd.set_option('display.max_columns', None)
X.head()
| | cap-shape | cap-surface | cap-color | bruises | odor | gill-attachment | gill-spacing | gill-size | gill-color | stalk-shape | stalk-root | stalk-surface-above-ring | stalk-surface-below-ring | stalk-color-above-ring | stalk-color-below-ring | veil-type | veil-color | ring-number | ring-type | spore-print-color | population | habitat |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | x | s | n | t | p | f | c | n | k | e | e | s | s | w | w | p | w | o | p | k | s | u |
| 1 | x | s | y | t | a | f | c | b | k | e | c | s | s | w | w | p | w | o | p | n | n | g |
| 2 | b | s | w | t | l | f | c | b | n | e | c | s | s | w | w | p | w | o | p | n | n | m |
| 3 | x | y | w | t | p | f | c | n | n | e | e | s | s | w | w | p | w | o | p | k | s | u |
| 4 | x | s | g | f | n | f | w | b | k | t | e | s | s | w | w | p | w | o | e | n | a | g |
Encoding Data
# Factorize all columns
for col in X.columns:
    X.loc[:, col] = pd.factorize(X[col], sort = True)[0]

# View first few rows of encoded data
X.iloc[0:5, 0:5]
| | cap-shape | cap-surface | cap-color | bruises | odor |
|---|---|---|---|---|---|
| 0 | 5 | 2 | 4 | 1 | 6 |
| 1 | 5 | 2 | 9 | 1 | 0 |
| 2 | 0 | 2 | 8 | 1 | 3 |
| 3 | 5 | 3 | 8 | 1 | 6 |
| 4 | 5 | 2 | 3 | 0 | 5 |
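If you haven’t seen pd.factorize before, it maps each category to an integer code; with sort = True, the codes follow the sorted order of the categories. A minimal sketch on a toy Series (not taken from the mushroom data):
# Toy example of pd.factorize: each code indexes into the sorted uniques
codes, uniques = pd.factorize(pd.Series(['s', 'y', 's', 'f']), sort = True)
print(codes)    # [1 2 1 0]
print(uniques)  # Index(['f', 's', 'y'], dtype='object')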
Time to impute!
Does our dataset have any missing values? Let’s check!
# Check for NAs
X.isna().sum()
cap-shape 0
cap-surface 0
cap-color 0
bruises 0
odor 0
gill-attachment 0
gill-spacing 0
gill-size 0
gill-color 0
stalk-shape 0
stalk-root 0
stalk-surface-above-ring 0
stalk-surface-below-ring 0
stalk-color-above-ring 0
stalk-color-below-ring 0
veil-type 0
veil-color 0
ring-number 0
ring-type 0
spore-print-color 0
population 0
habitat 0
dtype: int64
We are going to randomly assign observations in our dataset to be missing, and then see which imputation methods perform best by comparing their results to our actual dataset. Let’s randomly assign NA observations throughout our dataset. We will create a copy of our dataframe and call it X_Na.
# Create copy of X variables
X_Na = X.copy()
# Assign 10% of new dataframe with NA values
for col in X_Na.columns:
    X_Na.loc[X_Na.sample(frac = 0.1).index, col] = np.nan
# Check to make sure there are missing values
X_Na.isna().sum()
cap-shape 812
cap-surface 812
cap-color 812
bruises 812
odor 812
gill-attachment 812
gill-spacing 812
gill-size 812
gill-color 812
stalk-shape 812
stalk-root 812
stalk-surface-above-ring 812
stalk-surface-below-ring 812
stalk-color-above-ring 812
stalk-color-below-ring 812
veil-type 812
veil-color 812
ring-number 812
ring-type 812
spore-print-color 812
population 812
habitat 812
dtype: int64
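One caveat: sample draws different rows on every run, so the exact NA pattern above is not reproducible. If you wanted a repeatable version, one option (a sketch, using an arbitrary seed of 42 and the illustrative name X_Na_seeded) is to draw the indices from a single seeded NumPy generator:
# Reproducible variant: one seeded generator drives every column's draw
rng = np.random.default_rng(42)
X_Na_seeded = X.copy()
for col in X_Na_seeded.columns:
    na_idx = rng.choice(X_Na_seeded.index, size = int(0.1 * len(X_Na_seeded)), replace = False)
    X_Na_seeded.loc[na_idx, col] = np.nan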
Now that we have our dataset with missing values, let’s impute!
Imputation method #1: Filling NA values with the mode
# Impute with mode
# mode() returns a dataframe (ties can yield multiple rows), so .iloc[0]
# takes the first modal value for each column
X_mode_impute = X_Na.fillna(X_Na.mode().iloc[0])
# Check to make sure there are no NAs
X_mode_impute.isna().sum()
cap-shape 0
cap-surface 0
cap-color 0
bruises 0
odor 0
gill-attachment 0
gill-spacing 0
gill-size 0
gill-color 0
stalk-shape 0
stalk-root 0
stalk-surface-above-ring 0
stalk-surface-below-ring 0
stalk-color-above-ring 0
stalk-color-below-ring 0
veil-type 0
veil-color 0
ring-number 0
ring-type 0
spore-print-color 0
population 0
habitat 0
dtype: int64
Imputation method #2: Filling NA values with the median using SimpleImputer
# Impute with median (using SimpleImputer)
median_impute = SimpleImputer(strategy = 'median')
X_median_impute = median_impute.fit_transform(X_Na)

# fit_transform returns a NumPy array, so rebuild a dataframe with the original column names
X_median_impute = pd.DataFrame(X_median_impute, columns = X.columns)
# Check to make sure there are no NAs
X_median_impute.isna().sum()
cap-shape 0
cap-surface 0
cap-color 0
bruises 0
odor 0
gill-attachment 0
gill-spacing 0
gill-size 0
gill-color 0
stalk-shape 0
stalk-root 0
stalk-surface-above-ring 0
stalk-surface-below-ring 0
stalk-color-above-ring 0
stalk-color-below-ring 0
veil-type 0
veil-color 0
ring-number 0
ring-type 0
spore-print-color 0
population 0
habitat 0
dtype: int64
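As an aside, SimpleImputer can also reproduce the mode approach from method #1 via its 'most_frequent' strategy, which is handy if you want all your imputers behind the same interface. A quick sketch (X_mode_alt is just an illustrative name):
# Mode imputation through the SimpleImputer interface
mode_impute = SimpleImputer(strategy = 'most_frequent')
X_mode_alt = pd.DataFrame(mode_impute.fit_transform(X_Na), columns = X_Na.columns)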
Imputation method #3: Filling NA values with KNN Imputer
# Impute with KNN Imputer
knn_impute = KNNImputer(n_neighbors = 20)
X_knn_impute = knn_impute.fit_transform(X_Na)
X_knn_impute = pd.DataFrame(X_knn_impute, columns = X_Na.columns)
# Check to make sure there are no NAs
X_knn_impute.isna().sum()
cap-shape 0
cap-surface 0
cap-color 0
bruises 0
odor 0
gill-attachment 0
gill-spacing 0
gill-size 0
gill-color 0
stalk-shape 0
stalk-root 0
stalk-surface-above-ring 0
stalk-surface-below-ring 0
stalk-color-above-ring 0
stalk-color-below-ring 0
veil-type 0
veil-color 0
ring-number 0
ring-type 0
spore-print-color 0
population 0
habitat 0
dtype: int64
Now that we have three imputed dataframes from three different imputation methods, let’s see which best captured our real data! We can do this using the mean squared error!
# Calculate imputation accuracy using mean squared error
mse_mode = mean_squared_error(X, X_mode_impute)
mse_median = mean_squared_error(X, X_median_impute)
mse_knn = mean_squared_error(X, X_knn_impute)
# Report results
print(f"Mode imputation performance: {mse_mode}")
print(f"Median Imputation performance: {mse_median}")
print(f"KNN Imputation performance: {mse_knn}")
Mode imputation performance: 0.454623785864554
Median Imputation performance: 0.26057472807842086
KNN Imputation performance: 0.10431979040777048
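Note that n_neighbors = 20 was a fairly arbitrary choice. If you were curious how sensitive the KNN result is to k, a quick (if slow) sweep might look like this, with the candidate values picked arbitrarily for illustration:
# Hypothetical sweep over k; each fit_transform re-runs the full imputation
for k in [5, 10, 20, 40]:
    X_k = pd.DataFrame(KNNImputer(n_neighbors = k).fit_transform(X_Na),
                       columns = X_Na.columns)
    print(f"n_neighbors = {k}; MSE: {mean_squared_error(X, X_k)}")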
# Calculate imputation accuracy using R2
r2_mode = r2_score(X, X_mode_impute)
r2_median = r2_score(X, X_median_impute)
r2_knn = r2_score(X, X_knn_impute)
# Report results
print(f"Mode imputation performance: {r2_mode}")
print(f"Median Imputation performance: {r2_median}")
print(f"KNN Imputation performance: {r2_knn}")
Mode imputation performance: 0.8516325253899928
Median Imputation performance: 0.8839749208235788
KNN Imputation performance: 0.968076133787858
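One caution with both metrics: our “numbers” are really category codes, so MSE and R² treat the gap between codes 2 and 3 as meaningful when it isn’t. A complementary check (a sketch, not part of the standard workflow) is the fraction of originally-missing cells whose code was recovered exactly; since KNN fills in neighbor averages, we round before comparing:
# Share of originally-missing cells whose category code was recovered exactly
na_mask = X_Na.isna().to_numpy()
for name, imputed in [('mode', X_mode_impute), ('median', X_median_impute), ('knn', X_knn_impute)]:
    exact = (np.round(imputed.to_numpy()) == X.to_numpy())[na_mask].mean()
    print(f"{name}: {exact:.3f} recovered exactly")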
It looks like our KNN imputation was the most successful at imputing NAs! Let’s run a random forest with our actual data and our KNN-imputed data to see how/if they differ!
Random Forest Classifier with original data
# Split actual data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 42)
# Number of features to include for tuning
num_features = [1, 4, 7, 10, 13, 16, 19, 22]
accuracy = []

for feature in num_features:
    rf_classifier = RandomForestClassifier(
        n_estimators = 50,
        max_depth = 3,
        random_state = 42,
        max_features = feature
    )

    # Train model
    rf_classifier.fit(X_train, y_train)

    # Predict and evaluate
    y_pred = rf_classifier.predict(X_test)
    rf_accuracy = accuracy_score(y_test, y_pred)

    accuracy.append(rf_accuracy)
    print(f"Number of features: {feature}; Random Forest Accuracy: {rf_accuracy}")
Number of features: 1; Random Forest Accuracy: 0.916735028712059
Number of features: 4; Random Forest Accuracy: 0.9848236259228876
Number of features: 7; Random Forest Accuracy: 0.9868744872846595
Number of features: 10; Random Forest Accuracy: 0.9835931091058244
Number of features: 13; Random Forest Accuracy: 0.9823625922887613
Number of features: 16; Random Forest Accuracy: 0.9860541427399507
Number of features: 19; Random Forest Accuracy: 0.9819524200164069
Number of features: 22; Random Forest Accuracy: 0.9577522559474979
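The accuracy list we collected isn’t used above, but it makes it easy to visualize the tuning curve, for example with matplotlib (an extra import, not in our library list):
# Plot test accuracy against the number of features tried
import matplotlib.pyplot as plt

plt.plot(num_features, accuracy, marker = 'o')
plt.xlabel('max_features')
plt.ylabel('Test accuracy')
plt.title('Random forest tuning (original data)')
plt.show()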
Random Forest Classifier with imputed data:
# Split imputed data
X_train, X_test, y_train, y_test = train_test_split(X_knn_impute, y, test_size = 0.3, random_state = 42)
# Number of features to include for tuning
num_features = [1, 4, 7, 10, 13, 16, 19, 22]
accuracy = []

for feature in num_features:
    rf_classifier = RandomForestClassifier(
        n_estimators = 50,
        max_depth = 3,
        random_state = 42,
        max_features = feature
    )

    # Train model
    rf_classifier.fit(X_train, y_train)

    # Predict and evaluate
    y_pred = rf_classifier.predict(X_test)
    rf_accuracy = accuracy_score(y_test, y_pred)

    accuracy.append(rf_accuracy)
    print(f"Number of features: {feature}; Random Forest Accuracy: {rf_accuracy}")
Number of features: 1; Random Forest Accuracy: 0.9232977850697293
Number of features: 4; Random Forest Accuracy: 0.9819524200164069
Number of features: 7; Random Forest Accuracy: 0.9844134536505332
Number of features: 10; Random Forest Accuracy: 0.9835931091058244
Number of features: 13; Random Forest Accuracy: 0.9868744872846595
Number of features: 16; Random Forest Accuracy: 0.9721082854799016
Number of features: 19; Random Forest Accuracy: 0.9708777686628384
Number of features: 22; Random Forest Accuracy: 0.9561115668580804
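So, which performed better? Judging from the printouts, the two models are nearly indistinguishable: both peak around 98.7% accuracy (at 7 features on the original data and 13 on the imputed data), so KNN-imputing 10% missingness cost us almost nothing. To line the two runs up directly, you would need to keep their results in separately named lists rather than reusing accuracy for both; a sketch with the hypothetical names accuracy_original and accuracy_imputed:
# Assumes the two loops stored results in accuracy_original and
# accuracy_imputed (hypothetical names) instead of both reusing `accuracy`
for n, a_orig, a_imp in zip(num_features, accuracy_original, accuracy_imputed):
    print(f"max_features = {n}: original = {a_orig:.4f}, imputed = {a_imp:.4f}")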