CS 307: Week 06

import numpy as np
import pandas as pd
from sklearn.datasets import make_regression
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import cross_validate

Current Strategy: A Single Validation Set

# generate random data, but with a fixed seed
X, y = make_regression(n_samples=400, n_features=5, random_state=42)

# train-test split, then a further vtrain-validation split of the training data; do not control randomness
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
X_vtrain, X_val, y_vtrain, y_val = train_test_split(X_train, y_train, test_size=0.2)

# setup two decision trees, with different values of min_samples_split
dt_02 = DecisionTreeRegressor(min_samples_split=2)
dt_20 = DecisionTreeRegressor(min_samples_split=20)

# fit the trees
dt_02.fit(X_vtrain, y_vtrain)
dt_20.fit(X_vtrain, y_vtrain)

# calculate validation RMSE for both trees
dt_02_rmse = mean_squared_error(y_val, dt_02.predict(X_val), squared=False)
dt_20_rmse = mean_squared_error(y_val, dt_20.predict(X_val), squared=False)

# pick the "best" value of the tuning parameter (of the two considered)
best_min_samples_split = 20 if dt_02_rmse > dt_20_rmse else 2

# display results
print(f"The validation RMSE when min_samples_split=2 is {dt_02_rmse}.")
print(f"The validation RMSE when min_samples_split=20 is {dt_20_rmse}.")
print(f"The chosen value of min_samples_split is {best_min_samples_split}.")
The validation RMSE when min_samples_split=2 is 46.33832623332579.
The validation RMSE when min_samples_split=20 is 46.15649041560169.
The chosen value of min_samples_split is 20.

Notice that if you repeatedly run this cell, you will most often select min_samples_split=2, but sometimes, as in the run shown above, you will select min_samples_split=20. Because the vtrain-validation split is random and uncontrolled, the selected tuning parameter can change from run to run.
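
To see this instability directly, you could repeat the split-fit-select procedure in a loop and tally the results. The following cell is a minimal sketch added for illustration (the repetition count of 100 is an arbitrary choice), assuming the cells above have already been run.

# sketch: repeat the random split-fit-select procedure and count the winners
selections = []
for _ in range(100):
    # new random vtrain-validation split on each iteration
    X_vt, X_v, y_vt, y_v = train_test_split(X_train, y_train, test_size=0.2)
    rmse = {}
    for mss in [2, 20]:
        dt = DecisionTreeRegressor(min_samples_split=mss)
        dt.fit(X_vt, y_vt)
        rmse[mss] = mean_squared_error(y_v, dt.predict(X_v), squared=False)
    # record the value of min_samples_split with the lower validation RMSE
    selections.append(min(rmse, key=rmse.get))
print(pd.Series(selections).value_counts())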

New Strategy: Cross-Validation

# generate random data, but with a fixed seed
X, y = make_regression(n_samples=400, n_features=5, random_state=42)

# train-test split, do not control randomness
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# setup two decision trees, with different values of min_samples_split
dt_02 = DecisionTreeRegressor(min_samples_split=2)
dt_20 = DecisionTreeRegressor(min_samples_split=20)

# perform 5-fold cross-validation for both models
cv_results_02 = cross_validate(dt_02, X_train, y_train, cv=5, scoring='neg_root_mean_squared_error')
cv_results_20 = cross_validate(dt_20, X_train, y_train, cv=5, scoring='neg_root_mean_squared_error')

# calculate validation RMSE for both trees
dt_02_cv_rmse = -np.mean(cv_results_02['test_score'])
dt_20_cv_rmse = -np.mean(cv_results_20['test_score'])

# pick the "best" value of the tuning parameter (of the two considered)
best_min_samples_split = 20 if dt_02_cv_rmse > dt_20_cv_rmse else 2

# display results
print(f"The cross-validated RMSE when min_samples_split=2 is {dt_02_cv_rmse}.")
print(f"The cross-validated RMSE when min_samples_split=20 is {dt_20_cv_rmse}.")
print(f"The chosen value of min_samples_split is {best_min_samples_split}.")
The cross-validated RMSE when min_samples_split=2 is 43.1486920331054.
The cross-validated RMSE when min_samples_split=20 is 46.13119783789368.
The chosen value of min_samples_split is 2.

Notice that if you repeatedly run this cell, you will seemingly always (though not necessarily) select min_samples_split=2. Because the cross-validated RMSE averages over five folds rather than relying on a single random split, it is a lower-variance estimate, and the selection is far more stable.
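
One way to see why cross-validation is more stable is to inspect the individual fold scores rather than just their mean. A minimal sketch, assuming the cell above has been run:

# sketch: examine the per-fold RMSE values and their spread
fold_rmse_02 = -cv_results_02['test_score']
fold_rmse_20 = -cv_results_20['test_score']
print(f"Per-fold RMSE, min_samples_split=2: {fold_rmse_02}")
print(f"Per-fold RMSE, min_samples_split=20: {fold_rmse_20}")
print(f"Std of fold RMSE (2 vs 20): {np.std(fold_rmse_02)} vs {np.std(fold_rmse_20)}")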

Returning to Lab 02, With Pipelines!

import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.metrics import mean_squared_error
from ISLP import load_data

# Load the Credit data into a pandas DataFrame
Credit = load_data("Credit")

# Define the features and target
features = ['Income', 'Balance', 'Cards', 'Age', 'Education', 'Gender', 'Student', 'Married', 'Ethnicity']
target = 'Rating'

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(Credit[features], Credit[target], test_size=0.2, random_state=42)

# Define the column transformer
numeric_transformer = Pipeline(steps=[
    ('scaler', StandardScaler())
])

categorical_transformer = Pipeline(steps=[
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, ['Income', 'Balance', 'Cards', 'Age', 'Education']),
        ('cat', categorical_transformer, ['Gender', 'Student', 'Married', 'Ethnicity'])
    ])

# Define the pipeline
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('dt', DecisionTreeRegressor())
])

# Define the parameter grid to search over
param_grid = {
    'dt__max_depth': [2, 4, 6, 8, 10],
    'dt__min_samples_split': [2, 5, 10, 15, 20]
}

# Define the GridSearchCV object
grid_search = GridSearchCV(pipeline, param_grid=param_grid, cv=5, scoring="neg_root_mean_squared_error")

# Fit the GridSearchCV object to the data
grid_search.fit(X_train, y_train)

# Print the best parameter and score
print("Best parameter: ", grid_search.best_params_)
print("Best score: ", -grid_search.best_score_)

# Evaluate the best model on the testing data
y_pred = grid_search.predict(X_test)
rmse = mean_squared_error(y_test, y_pred, squared=False)
print("RMSE on testing data: ", rmse)
Best parameter:  {'dt__max_depth': 10, 'dt__min_samples_split': 2}
Best score:  40.46949398936537
RMSE on testing data:  37.386612984005986
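
GridSearchCV also retains the cross-validation results for every parameter combination, which is useful for checking whether the runner-up settings were competitive. A minimal sketch, assuming the grid search above has been fit:

# sketch: view the mean CV RMSE for each parameter combination, best first
cv_results = pd.DataFrame(grid_search.cv_results_)
cv_results['mean_cv_rmse'] = -cv_results['mean_test_score']
cols = ['param_dt__max_depth', 'param_dt__min_samples_split', 'mean_cv_rmse']
print(cv_results[cols].sort_values('mean_cv_rmse').head())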