CS 307: Week 06

import numpy as np
import pandas as pd
from sklearn.datasets import make_regression
from sklearn.model_selection import train_test_split, cross_validate
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error

Current Strategy: A Single Validation Set
# generate random data, but with a fixed seed
X, y = make_regression(n_samples=400, n_features=5, random_state=42)

# train-test split and train-validation split, do not control randomness
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
X_vtrain, X_val, y_vtrain, y_val = train_test_split(X_train, y_train, test_size=0.2)
# setup two decision trees, with different values of min_samples_split
dt_02 = DecisionTreeRegressor(min_samples_split=2)
dt_20 = DecisionTreeRegressor(min_samples_split=20)
# fit the trees
dt_02.fit(X_vtrain, y_vtrain)
dt_20.fit(X_vtrain, y_vtrain)
# calculate validation RMSE for both trees
dt_02_rmse = mean_squared_error(y_val, dt_02.predict(X_val), squared=False)
dt_20_rmse = mean_squared_error(y_val, dt_20.predict(X_val), squared=False)
# pick the "best" value of the tuning parameter (of the two considered)
best_min_samples_split = 20 if dt_02_rmse > dt_20_rmse else 2
# display results
print(f"The validation RMSE when min_samples_split=2 is {dt_02_rmse}.")
print(f"The validation RMSE when min_samples_split=20 is {dt_20_rmse}.")
print(f"The chosen value of min_samples_split is {best_min_samples_split}.")
The validation RMSE when min_samples_split=2 is 46.33832623332579.
The validation RMSE when min_samples_split=20 is 46.15649041560169.
The chosen value of min_samples_split is 20.
Notice that if you repeatedly run this cell, you will most often select min_samples_split=2, but sometimes you will select min_samples_split=20.
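
To see this without re-running the cell by hand, one option is to repeat the uncontrolled train-validation split in a loop and tally which value wins each time. The sketch below assumes the variables from the cell above; the repetition count and the names n_repeats and selections are illustrative, not part of the lab code.

# repeat the random train-validation split and count which value of
# min_samples_split is selected each time (illustrative sketch)
n_repeats = 100
selections = []
for _ in range(n_repeats):
    # a new random validation split each repetition
    X_vtrain, X_val, y_vtrain, y_val = train_test_split(X_train, y_train, test_size=0.2)
    rmses = {}
    for mss in (2, 20):
        dt = DecisionTreeRegressor(min_samples_split=mss)
        dt.fit(X_vtrain, y_vtrain)
        rmses[mss] = mean_squared_error(y_val, dt.predict(X_val), squared=False)
    # record whichever value achieved the lower validation RMSE
    selections.append(min(rmses, key=rmses.get))
print({mss: selections.count(mss) for mss in (2, 20)})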
New Strategy: Cross-Validation
# generate random data, but with a fixed seed
X, y = make_regression(n_samples=400, n_features=5, random_state=42)

# train-test split, do not control randomness
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
# setup two decision trees, with different values of min_samples_split
dt_02 = DecisionTreeRegressor(min_samples_split=2)
dt_20 = DecisionTreeRegressor(min_samples_split=20)
# perform 5-fold cross-validation for both models
cv_results_02 = cross_validate(dt_02, X_train, y_train, cv=5, scoring='neg_root_mean_squared_error')
cv_results_20 = cross_validate(dt_20, X_train, y_train, cv=5, scoring='neg_root_mean_squared_error')
# calculate validation RMSE for both trees
= -np.mean(cv_results_02['test_score'])
dt_02_cv_rmse = -np.mean(cv_results_20['test_score'])
dt_20_cv_rmse
# pick the "best" value of the tuning parameter (of the two considered)
best_min_samples_split = 20 if dt_02_cv_rmse > dt_20_cv_rmse else 2
# display results
print(f"The cross-validated RMSE when min_samples_split=2 is {dt_02_cv_rmse}.")
print(f"The cross-validated RMSE when min_samples_split=20 is {dt_20_cv_rmse}.")
print(f"The chosen value of min_samples_split is {best_min_samples_split}.")
The cross-validated RMSE when min_samples_split=2 is 43.1486920331054.
The cross-validated RMSE when min_samples_split=20 is 46.13119783789368.
The chosen value of min_samples_split is 2.
Notice that if you repeatedly run this cell, you will seemingly always (but not necessarily) select min_samples_split=2. Because cross-validation averages validation RMSE over five folds, the comparison is much less sensitive to any single lucky or unlucky split.
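
The same experiment can be repeated for the cross-validation strategy. The sketch below is again illustrative rather than part of the lab; it assumes the variables from the cells above and uses a smaller repetition count, since each repetition now fits ten trees.

# repeat the random train-test split and count which value of
# min_samples_split 5-fold cross-validation selects (illustrative sketch)
n_repeats = 20
selections = []
for _ in range(n_repeats):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
    cv_rmses = {}
    for mss in (2, 20):
        cv = cross_validate(
            DecisionTreeRegressor(min_samples_split=mss),
            X_train, y_train, cv=5,
            scoring='neg_root_mean_squared_error',
        )
        cv_rmses[mss] = -np.mean(cv['test_score'])
    # record whichever value achieved the lower cross-validated RMSE
    selections.append(min(cv_rmses, key=cv_rmses.get))
print({mss: selections.count(mss) for mss in (2, 20)})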
Returning to Lab 02, With Pipelines!
import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.metrics import mean_squared_error
from ISLP import load_data
# Load the Credit data into a pandas DataFrame
Credit = load_data("Credit")
# Define the features and target
features = ['Income', 'Balance', 'Cards', 'Age', 'Education', 'Gender', 'Student', 'Married', 'Ethnicity']
target = 'Rating'
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(Credit[features], Credit[target], test_size=0.2, random_state=42)
# Define the preprocessing for numeric and categorical features
numeric_transformer = Pipeline(steps=[
    ('scaler', StandardScaler())
])
categorical_transformer = Pipeline(steps=[
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Combine the two in a column transformer
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, ['Income', 'Balance', 'Cards', 'Age', 'Education']),
        ('cat', categorical_transformer, ['Gender', 'Student', 'Married', 'Ethnicity'])
    ]
)
# Define the pipeline
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('dt', DecisionTreeRegressor())
])
# Define the parameter grid to search over
# (keys use the <step name>__<parameter> convention, here the 'dt' step)
param_grid = {
    'dt__max_depth': [2, 4, 6, 8, 10],
    'dt__min_samples_split': [2, 5, 10, 15, 20]
}
# Define the GridSearchCV object
grid_search = GridSearchCV(pipeline, param_grid=param_grid, cv=5, scoring="neg_root_mean_squared_error")
# Fit the GridSearchCV object to the data
grid_search.fit(X_train, y_train)
# Print the best parameters and score
print("Best parameter: ", grid_search.best_params_)
print("Best score: ", -grid_search.best_score_)
# Evaluate the best model on the testing data
y_pred = grid_search.predict(X_test)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print("RMSE on testing data: ", rmse)
Best parameter: {'dt__max_depth': 10, 'dt__min_samples_split': 2}
Best score: 40.46949398936537
RMSE on testing data: 37.386612984005986
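
If you want to look beyond the single best combination, GridSearchCV stores the per-combination cross-validation results in its cv_results_ attribute. A minimal sketch of inspecting them (the mean_rmse column name here is ours, not scikit-learn's):

# inspect the full grid of cross-validated results, best (lowest RMSE) first
cv_results = pd.DataFrame(grid_search.cv_results_)
cv_results['mean_rmse'] = -cv_results['mean_test_score']  # undo the sign flip
print(cv_results[['param_dt__max_depth',
                  'param_dt__min_samples_split',
                  'mean_rmse']].sort_values('mean_rmse').head())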