import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
import matplotlib.pyplot as plt
from ipywidgets import interact
import ipywidgets as widgets
from ipywidgets import interact, FloatSlider
from IPython.display import display, clear_output
Introduction
In this week’s discussion section, we will be using the same dataset from our weekly lab - Water characteristics in the Hudson River after Hurricane Irene. However, rather than looking at a single predictor variable, we are going to add more! Can we improve our model if we add more variables? Let’s find out.
Data Loading
Access the same .xlsx file we used in lab this week. If you lost access to it, you can find the data here. Instead of looking at only the dissolved oxygen and turbidity data this time, we are also going to read in data on rainfall. Read in each of these sheets in the Excel workbook as its own dataframe. Load the following libraries:
# Load the data: one dataframe per sheet, dropping the Piermont column from each
fp = '../data/Hurricane_Irene_Hudson_River.xlsx'

# NOTE: the dissolved-oxygen sheet is selected by position (index 5), while the
# other two sheets are selected by name.
do_data = pd.read_excel(fp, sheet_name=5).drop(
    columns=['Piermont D.O. (ppm)']
)
rainfall_data = pd.read_excel(fp, sheet_name='Rainfall').drop(
    columns=['Piermont Rainfall Daily Accumulation (Inches)']
)
turbidity_data = pd.read_excel(fp, sheet_name='Turbidity').drop(
    columns=['Piermont Turbidity in NTU']
)
Data Wrangling
Perform the following data wrangling steps to get our data ready for our model.
- Merge the three dataframes together. While merging, or after, drop all columns for the Piermont location.
- Update the column names to be shorter and not have spaces. Use snake case.
- Make your date column a datetime object.
- Set the date as the index for the merged dataframe.
# Merge the three datasets on the shared timestamp column.
# Merge order (rainfall, turbidity, dissolved oxygen) determines column order.
data = rainfall_data.merge(turbidity_data, on='Date Time (ET)')
data = data.merge(do_data, on='Date Time (ET)')

data.head()
# Update the column names to short snake_case names.
# The list is positional, so it must follow the merged column order:
# timestamp, rainfall columns, turbidity columns, dissolved-oxygen columns.
data.columns = [
    'date',
    'albany_rainfall',
    'norrie_rainfall',
    'albany_turbidity',
    'norrie_turbidity',
    'albany_do',
    'norrie_do',
]
# Convert date to datetime format and set it as index
data['date'] = pd.to_datetime(data['date'])

# Update index (in place, so 'date' is no longer a regular column afterwards)
data.set_index('date', inplace=True)
Multiple Linear Regression
Now that our data is cleaned, let’s do the following to carry out a multiple linear regression.
- Define your predictors and target variables.
- Split the data into training and testing sets
- Create and fit the model
- Predict and Evaluate your model
# Define predictors and the target variable
X = data[['albany_rainfall', 'norrie_rainfall', 'albany_do', 'norrie_do']]  # Adjust as needed
y = data['albany_turbidity']
# Split the data into training and testing sets (70% train / 30% test).
# A fixed random_state keeps the split reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)
# Create and fit the model
model = LinearRegression()
model.fit(X_train, y_train)
# Predict on the held-out test set and evaluate
y_pred = model.predict(X_test)

print(f"RMSE: {np.sqrt(mean_squared_error(y_test, y_pred))}")
print(f"R-squared: {r2_score(y_test, y_pred)}")
RMSE: 187.03290519070686
R-squared: 0.6382523355891789
Create a Widget for updating the predictor and target variables.
- Create the four different pieces to the widget: the predictor selector, the target selector, the evaluate button, and the output
- Wrap our workflow into a function called evaluate_model(). This function will run a linear regression model based on what the user selects as predictors and the outcome variable. It will print the \(R^2\), MSE, and a scatterplot of the actual versus predicted target variable.
- Create a warning for your widget to ensure that the user does not select the same variable as both a predictor variable and a target variable.
- Play around with your widget and see how your \(R^2\) changes based on your selected variables!
# Create a widget for selecting predictors (multiple selection allowed)
predictor_selector = widgets.SelectMultiple(
    options=data.columns,     # Options for predictors: columns of data
    value=[data.columns[0]],  # Default selected: 1st column of data (albany_rainfall)
    description='Predictors'  # Name the predictor selection
)
# Create a dropdown for selecting the target variable (single selection)
target_selector = widgets.Dropdown(
    options=data.columns,   # Options for target: columns of data
    value=data.columns[1],  # Default selected: 2nd column of data (norrie_rainfall)
    description='Target',
)
# Create button to evaluate the model
evaluate_button = widgets.Button(description="Evaluate Model")
# Output widget to display results (printed metrics and the plot are captured here)
output = widgets.Output()
# Define the function to handle button clicks
def evaluate_model(b):
    """Fit and evaluate a linear regression for the current widget selections.

    Reads the chosen predictors from ``predictor_selector`` and the target
    from ``target_selector``, fits a ``LinearRegression`` on a 70/30
    train/test split of ``data``, then prints R^2 and MSE and draws a
    scatter plot of actual versus predicted target values. Everything is
    rendered inside the ``output`` widget.

    Args:
        b: The object passed by the button's click callback; unused.
    """
    with output:
        # Clear previously displayed output before running
        clear_output(wait=True)

        # Pull out the predictor values selected by the user
        selected_predictors = list(predictor_selector.value)

        # A regression needs at least one feature; fitting with none would fail
        if not selected_predictors:
            print("Select at least one predictor.")
            return

        # Make sure the target variable is not also a predictor variable
        if target_selector.value in selected_predictors:
            print("Target variable must not be in the predictors.")
            return

        # Assign X and y variables
        X = data[selected_predictors]
        y = data[target_selector.value]

        # Split data into training and testing sets (fixed seed for reproducibility)
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.3, random_state=42
        )

        # Create and fit the model
        model = LinearRegression()
        model.fit(X_train, y_train)

        # Predict and calculate R^2 and MSE
        y_pred = model.predict(X_test)
        r2 = r2_score(y_test, y_pred)
        mse = mean_squared_error(y_test, y_pred)

        # Display the R^2 score and MSE
        print(f"R^2: {r2:.4f}")
        print(f"MSE: {mse:.4f}")

        # Create a scatter plot of y test (x-axis) vs predicted y (y-axis)
        plt.scatter(y_test, y_pred)
        plt.xlabel('Actual')
        plt.ylabel('Predicted')
        plt.title('Actual vs Predicted')
        plt.show()
# Display the widgets and connect the button to the function
display(predictor_selector, target_selector, evaluate_button, output)
evaluate_button.on_click(evaluate_model)