CS 307: Week 03

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import make_regression
from sklearn.datasets import make_friedman1
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import plot_tree
from sklearn.metrics import mean_squared_error
# simulate noisy observations of a sine wave on [-2*pi, 2*pi]
np.random.seed(42)  # fixed seed so the simulated data is reproducible
n = 200  # number of observations

# single feature, stored as an (n, 1) column
X = np.random.uniform(-2 * np.pi, 2 * np.pi, (n, 1))

# response = signal (sine of x) + Gaussian noise with standard deviation 0.25
noise = np.random.normal(0, 0.25, (n, 1))
y = np.sin(X) + noise
# two side-by-side panels: the left shows the data together with the true
# signal, the right shows only the data
fig, (ax1, ax2) = plt.subplots(nrows=1, ncols=2)
fig.set_size_inches(10, 5)
fig.set_dpi(100)
fig.suptitle('Simulated Sine Wave Data')

# grid of x values used to draw curves
x_plot = np.linspace(-2*np.pi, 2*np.pi, 1000).reshape(-1, 1)

# both panels get the same scatter, labels, and grid; only the title differs
for panel, panel_title in ((ax1, "Simulation Study"), (ax2, "Reality")):
    panel.set_title(panel_title)
    panel.scatter(X, y, color="dodgerblue")
    panel.set_xlabel("x")
    panel.set_ylabel("y")
    panel.grid(True, linestyle='--', color='lightgrey')

# true regression function, the "signal" that we want to learn,
# drawn on the simulation-study panel only
ax1.plot(x_plot, np.sin(x_plot), color='black')

# render the figure
plt.show()

# k-nearest neighbors regressor with k=10, fit to the simulated sine data
knn010 = KNeighborsRegressor(n_neighbors=10)
knn010.fit(X, y)
KNeighborsRegressor(n_neighbors=10)
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
# decision tree regressor with min_samples_split=100 (a node needs at least
# 100 samples to be split further), fit to the simulated sine data
dt100 = DecisionTreeRegressor(min_samples_split=100)
dt100.fit(X, y)
DecisionTreeRegressor(min_samples_split=100)
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
# two side-by-side panels comparing the fitted KNN and decision tree models
fig, (ax1, ax2) = plt.subplots(nrows=1, ncols=2)
fig.set_size_inches(10, 5)
fig.set_dpi(100)
fig.suptitle('Simulated Sine Wave Data')

# grid of x values at which each model is evaluated for plotting
x_plot = np.linspace(-2*np.pi, 2*np.pi, 1000).reshape(-1, 1)

# (axes, title, fitted model) for each panel
panels = (
    (ax1, "k-Nearest Neighbors, k=10", knn010),
    (ax2, "Decision Tree, min_samples_split=100", dt100),
)

# each panel: training data as a scatter, model predictions as a black curve
for panel, panel_title, model in panels:
    panel.set_title(panel_title)
    panel.scatter(X, y, color="dodgerblue")
    panel.set_xlabel("x")
    panel.set_ylabel("y")
    panel.grid(True, linestyle='--', color='lightgrey')
    panel.plot(x_plot, model.predict(x_plot), color='black')

# render the figure
plt.show()

# draw the structure of the fitted tree on a small, high-resolution figure
fig, ax = plt.subplots(nrows=1, ncols=1)
fig.set_size_inches(3, 3)
fig.set_dpi(200)

# ax is the figure's only (and therefore current) axes
plot_tree(dt100, ax=ax)
plt.show()

# three decision trees that differ only in min_samples_split
dt002, dt100, dt250 = (
    DecisionTreeRegressor(min_samples_split=threshold)
    for threshold in (2, 100, 250)
)

# fit each tree to the same simulated data
dt002.fit(X, y)
dt100.fit(X, y)
dt250.fit(X, y)
DecisionTreeRegressor(min_samples_split=250)
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
# setup figure: one panel per decision tree, ordered from the largest
# min_samples_split to the smallest
fig, (ax1, ax2, ax3) = plt.subplots(1, 3)
fig.set_size_inches(15, 5)
fig.set_dpi(100)

# add overall title
fig.suptitle('Simulated Sine Wave Data')

# x values to make predictions at for plotting purposes
x_plot = np.linspace(-2*np.pi, 2*np.pi, 1000).reshape((1000, 1))

# create one subplot per fitted decision tree; the title is built from the
# model's own min_samples_split so the label cannot drift from what is plotted
# (these panels were previously mislabeled as "k-Nearest Neighbors")
for panel, tree in ((ax1, dt250), (ax2, dt100), (ax3, dt002)):
    panel.set_title(f"Decision Tree, min_samples_split={tree.min_samples_split}")
    panel.scatter(X, y, color="dodgerblue")
    panel.set_xlabel("x")
    panel.set_ylabel("y")
    panel.grid(True, linestyle='--', color='lightgrey')
    # fitted regression function of this tree
    panel.plot(x_plot, tree.predict(x_plot), color='black')

# show plot
plt.show()

# three decision trees that differ only in max_depth
dt_d01, dt_d05, dt_d10 = (
    DecisionTreeRegressor(max_depth=depth)
    for depth in (1, 5, 10)
)

# fit each tree to the same simulated data
dt_d01.fit(X, y)
dt_d05.fit(X, y)
dt_d10.fit(X, y)
DecisionTreeRegressor(max_depth=10)
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
# setup figure: one panel per decision tree, ordered from the smallest
# max_depth to the largest
fig, (ax1, ax2, ax3) = plt.subplots(1, 3)
fig.set_size_inches(15, 5)
fig.set_dpi(100)

# add overall title
fig.suptitle('Simulated Sine Wave Data')

# x values to make predictions at for plotting purposes
x_plot = np.linspace(-2*np.pi, 2*np.pi, 1000).reshape((1000, 1))

# create one subplot per fitted decision tree; the title is built from the
# model's own max_depth so the label cannot drift from what is plotted
# (these panels were previously mislabeled as "k-Nearest Neighbors")
for panel, tree in ((ax1, dt_d01), (ax2, dt_d05), (ax3, dt_d10)):
    panel.set_title(f"Decision Tree, max_depth={tree.max_depth}")
    panel.scatter(X, y, color="dodgerblue")
    panel.set_xlabel("x")
    panel.set_ylabel("y")
    panel.grid(True, linestyle='--', color='lightgrey')
    # fitted regression function of this tree
    panel.plot(x_plot, tree.predict(x_plot), color='black')

# show plot
plt.show()

# train and test sets come from the same Friedman #1 generator settings;
# only the random seed differs between them
friedman_kwargs = {"n_samples": 200, "n_features": 7}
X_train, y_train = make_friedman1(random_state=42, **friedman_kwargs)
X_test, y_test = make_friedman1(random_state=1, **friedman_kwargs)

# check the dimensions of the training features
X_train.shape
(200, 7)
# k-nearest neighbors with k=25, trained on the Friedman training set
knn = KNeighborsRegressor(n_neighbors=25)
knn.fit(X_train, y_train)

# decision tree with min_samples_split=10, trained on the same data
dt = DecisionTreeRegressor(min_samples_split=10)
dt.fit(X_train, y_train)
DecisionTreeRegressor(min_samples_split=10)
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
# predictions of both models on the held-out test set
knn_pred = knn.predict(X_test)
dt_pred = dt.predict(X_test)

# report test RMSE (square root of the mean squared error): KNN first, then tree
for test_pred in (knn_pred, dt_pred):
    print(np.sqrt(mean_squared_error(y_test, test_pred)))
2.983752505149063
2.834171716110265