# Basics
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
# Probability
from scipy.stats import multivariate_normal
# Machine Learning
from sklearn.preprocessing import StandardScaler
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB
# Classification Metrics
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.metrics import ConfusionMatrixDisplay
CS 307: Week 10
# Deal with pandas versus seaborn issues
="ignore", category=FutureWarning) warnings.simplefilter(action
Some Bayes Theorem
Given priors:
- \(\pi_A = 0.20\)
- \(\pi_B = 0.50\)
- \(\pi_C = 0.30\)
And likelihoods:
- \(X \mid Y = A \sim N(\mu = 2, \sigma = 1)\)
- \(X \mid Y = B \sim N(\mu = 3, \sigma = 2)\)
- \(X \mid Y = C \sim N(\mu = 4, \sigma = 1)\)
Calculate posteriors:
- \(P(Y = A \mid X = 3.4)\)
- \(P(Y = B \mid X = 3.4)\)
- \(P(Y = C \mid X = 3.4)\)
import numpy as np
from scipy.stats import norm
import matplotlib.pyplot as plt
x = np.linspace(-2, 8, 1000)
mu = [2, 3, 4]
sigma = [1, 2, 1]

fig, axs = plt.subplots(1, 2, figsize=(12, 5))

for i in range(3):
    y = norm.pdf(x, loc=mu[i], scale=sigma[i])
    axs[0].plot(x, y, label=f"Y={chr(65+i)}", linewidth=2)

axs[0].axvline(x=3.4, color="red", linestyle="--")
axs[0].set_xlabel("X")
axs[0].set_ylabel("Density")
axs[0].grid(color="lightgrey", linestyle="--")
axs[0].set_title("Likelihoods")
axs[0].legend()

priors = [0.20, 0.50, 0.30]

z = []
for i in range(len(x)):
    z.append(np.sum(priors * norm.pdf(x[i], loc=mu, scale=sigma)))

for i in range(3):
    y = priors[i] * norm.pdf(x, loc=mu[i], scale=sigma[i])
    axs[1].plot(x, y / z, label=f"Y={chr(65+i)}", linewidth=2)

axs[1].axvline(x=3.4, color="red", linestyle="--")
axs[1].set_xlabel("X")
axs[1].set_ylabel("Density")
axs[1].grid(color="lightgrey", linestyle="--")
axs[1].set_title("Posteriors")
axs[1].legend()
plt.show()
import numpy as np
from scipy.stats import norm
priors = [0.20, 0.50, 0.30]
likelihoods = norm.pdf(x=3.4, loc=[2, 3, 4], scale=[1, 2, 1])
priors * likelihoods / np.sum(priors * likelihoods)
array([0.13152821, 0.42938969, 0.43908211])
- \(P(Y = A \mid X = 3.4) = 0.13152821\)
- \(P(Y = B \mid X = 3.4) = 0.42938969\)
- \(P(Y = C \mid X = 3.4) = 0.43908211\)
Generative Models
\[ p_k(\boldsymbol{x}) = P(Y = k \mid \boldsymbol{X} = \boldsymbol{x}) = \frac{\pi_k \cdot f_k(\boldsymbol{x})}{\sum_{g = 1}^{G} \pi_g \cdot f_g(\boldsymbol{x})} \]
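As a rough illustration (not part of the original notes), this formula can be coded directly once we have a prior and a fitted density for each class. The function name and example distributions below are made up for demonstration:

```python
import numpy as np
from scipy.stats import multivariate_normal

def posterior_probs(x, priors, densities):
    # pi_g * f_g(x) for every class g
    weighted = np.array([pi * f.pdf(x) for pi, f in zip(priors, densities)])
    # Divide by the sum over classes (the denominator of the formula above)
    return weighted / weighted.sum()

# Two hypothetical classes with known priors and Gaussian densities
priors = [0.4, 0.6]
densities = [
    multivariate_normal(mean=[0, 0], cov=np.eye(2)),
    multivariate_normal(mean=[2, 2], cov=np.eye(2)),
]
print(posterior_probs([1.0, 1.0], priors, densities))  # the two entries sum to 1
```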
Helper Functions
def make_plot(mu1, cov1, mu2, cov2):
    # Create two multivariate normal distributions
    dist1 = multivariate_normal(mu1, cov1)
    dist2 = multivariate_normal(mu2, cov2)

    # Generate random samples from each distribution
    samples1 = dist1.rvs(size=100)
    samples2 = dist2.rvs(size=100)

    # Change color maps for scatter plot
    cmap1 = plt.get_cmap("Blues")
    cmap2 = plt.get_cmap("Oranges")

    # Plot true distributions
    x_min = np.min([np.min(samples1[:, 0]), np.min(samples2[:, 0])]) - 1
    x_max = np.max([np.max(samples1[:, 0]), np.max(samples2[:, 0])]) + 1
    y_min = np.min([np.min(samples1[:, 1]), np.min(samples2[:, 1])]) - 1
    y_max = np.max([np.max(samples1[:, 1]), np.max(samples2[:, 1])]) + 1
    x, y = np.mgrid[x_min:x_max:0.01, y_min:y_max:0.01]
    pos = np.empty(x.shape + (2,))
    pos[:, :, 0] = x
    pos[:, :, 1] = y
    fig, ax = plt.subplots(1, 2, figsize=(10, 5))
    ax[0].contourf(x, y, dist1.pdf(pos), cmap="Blues")
    ax[0].contour(x, y, dist1.pdf(pos), colors="k")
    ax[0].contourf(x, y, dist2.pdf(pos), cmap="Oranges", alpha=0.5)
    ax[0].contour(x, y, dist2.pdf(pos), colors="k")
    ax[0].grid(color="lightgrey", linestyle="--")
    ax[0].set_title("True Distribution")
    ax[1].scatter(samples1[:, 0], samples1[:, 1], color=cmap1(0.5))
    ax[1].scatter(samples2[:, 0], samples2[:, 1], color=cmap2(0.5))
    ax[1].set_title("Sampled Data")
    ax[1].grid(color="lightgrey", linestyle="--")
    ax[0].set_xlim(x_min, x_max)
    ax[0].set_ylim(y_min, y_max)
    ax[1].set_xlim(x_min, x_max)
    ax[1].set_ylim(y_min, y_max)
    plt.show()
# Create mean (mu) and covariance (cov) arrays
def create_params(mu_1, mu_2, sigma_1, sigma_2, corr):
    # Create mean vector
    mu = np.array([mu_1, mu_2])
    # Calculate covariance matrix
    cov = np.array([[sigma_1**2, corr * sigma_1 * sigma_2], [corr * sigma_1 * sigma_2, sigma_2**2]])
    # Return both
    return mu, cov
# Print parameter information
def print_setup_parameter_info(mu1, cov1, mu2, cov2):
    print("Mean X when Y = Blue:", mu1)
    print("Covariance of X when Y = Blue:", "\n", cov1)
    print("Mean X when Y = Orange:", mu2)
    print("Covariance of X when Y = Orange:", "\n", cov2)
Linear Discriminant Analysis (LDA)
Linear Discriminant Analysis, LDA, assumes that the features are multivariate normal conditioned on the target classes. Importantly, LDA assumes the same covariance matrix (shape) for every class.
\[ \boldsymbol{X} \mid Y = k \sim N(\boldsymbol{\mu}_k, \boldsymbol\Sigma) \]
\[ \boldsymbol\Sigma = \boldsymbol\Sigma_1 = \boldsymbol\Sigma_2 = \cdots = \boldsymbol\Sigma_G \]
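Because \(\boldsymbol\Sigma\) is shared, the quadratic term in the log of each class density is the same for every class and cancels when classes are compared, leaving a discriminant score that is linear in \(\boldsymbol{x}\), which is where the name comes from:

\[ \delta_k(\boldsymbol{x}) = \boldsymbol{x}^\top \boldsymbol\Sigma^{-1} \boldsymbol\mu_k - \frac{1}{2} \boldsymbol\mu_k^\top \boldsymbol\Sigma^{-1} \boldsymbol\mu_k + \log \pi_k \]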
# Make a two-class LDA setup
mu1, cov1 = create_params(0, 0, 1, 1, 0.5)
mu2, cov2 = create_params(2, 2, 1, 1, 0.5)
print_setup_parameter_info(mu1, cov1, mu2, cov2)
Mean X when Y = Blue: [0 0]
Covariance of X when Y = Blue:
[[1. 0.5]
[0.5 1. ]]
Mean X when Y = Orange: [2 2]
Covariance of X when Y = Orange:
[[1. 0.5]
[0.5 1. ]]
# Plot it!
make_plot(mu1, cov1, mu2, cov2)
Quadratic Discriminant Analysis (QDA)
Quadratic Discriminant Analysis, QDA, also assumes that the features are multivariate normal conditioned on the target classes. However, unlike LDA, QDA makes fewer assumptions about the covariances (shapes): it allows each class to have its own, completely different covariance matrix.
\[ \boldsymbol X \mid Y = k \sim N(\boldsymbol\mu_k, \boldsymbol\Sigma_k) \]
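Because each class keeps its own \(\boldsymbol\Sigma_k\), the usual log-posterior derivation retains a quadratic term in \(\boldsymbol{x}\), hence "Quadratic":

\[ \delta_k(\boldsymbol{x}) = -\frac{1}{2} (\boldsymbol{x} - \boldsymbol\mu_k)^\top \boldsymbol\Sigma_k^{-1} (\boldsymbol{x} - \boldsymbol\mu_k) - \frac{1}{2} \log \lvert \boldsymbol\Sigma_k \rvert + \log \pi_k \]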
# Make a two-class QDA setup
mu1, cov1 = create_params(0, 0, 2, 1, 0.5)
mu2, cov2 = create_params(2, 2, 1, 0.5, -0.5)
print_setup_parameter_info(mu1, cov1, mu2, cov2)
Mean X when Y = Blue: [0 0]
Covariance of X when Y = Blue:
[[4. 1.]
[1. 1.]]
Mean X when Y = Orange: [2 2]
Covariance of X when Y = Orange:
[[ 1. -0.25]
[-0.25 0.25]]
# Plot it!
make_plot(mu1, cov1, mu2, cov2)
Naive Bayes (NB)
Naive Bayes comes in many forms. With only numeric features, it often assumes a multivariate normal conditioned on the target classes, but a very specific multivariate normal.
\[ {\boldsymbol X} \mid Y = k \sim N(\boldsymbol\mu_k, \boldsymbol\Sigma_k) \]
Naive Bayes assumes that the features \(X_1, X_2, \ldots, X_p\) are independent given \(Y = k\). This is the “naive” part of naive Bayes. The Bayes part is nothing new. Since \(X_1, X_2, \ldots, X_p\) are assumed independent, each \(\boldsymbol\Sigma_k\) is diagonal, that is, we assume no correlation between features. Independence implies zero correlation.
\[ \boldsymbol\Sigma_k = \begin{bmatrix} \sigma_{k1}^2 & 0 & \cdots & 0 \\ 0 & \ddots & \ddots & \vdots \\ \vdots & \ddots & \ddots & 0 \\ 0 & \cdots & 0 & \sigma_{kp}^2 \\ \end{bmatrix} \]
This will allow us to write the (joint) likelihood as a product of univariate distributions. In this case, the product of univariate normal distributions instead of a (joint) multivariate distribution.
\[ f_k(\boldsymbol x) = \prod_{j = 1}^{p} f_{kj}(x_j) \]
Here, \(f_{kj}(x_j)\) is the density of the \(j\)-th feature conditioned on the \(k\)-th class. Notice that there is a \(\mu_{kj}\) and a \(\sigma_{kj}\) for each feature within each class.
\[ f_{kj}(x_j) = \frac{1}{\sigma_{kj}\sqrt{2\pi}}\exp\left[-\frac{1}{2}\left(\frac{x_j - \mu_{kj}}{\sigma_{kj}}\right)^2\right] \]
When \(p = 1\), this version of naive Bayes is equivalent to QDA.
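A quick numeric sanity check (not in the original notes): with a diagonal covariance, the joint multivariate normal density is exactly the product of the univariate normal densities. The numbers below are arbitrary.

```python
import numpy as np
from scipy.stats import norm, multivariate_normal

# Arbitrary diagonal-covariance setup for one class
mu = np.array([2.0, -1.0])
sigma = np.array([1.5, 0.5])
x = np.array([2.3, -0.4])

# Joint density with a diagonal covariance matrix
joint = multivariate_normal(mean=mu, cov=np.diag(sigma**2)).pdf(x)

# Product of the univariate densities
product = np.prod(norm.pdf(x, loc=mu, scale=sigma))

print(np.isclose(joint, product))  # True
```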
# Make a two-class NB setup
mu1, cov1 = create_params(0, 0, 2, 1, 0)
mu2, cov2 = create_params(2, 2, 1, 0.5, 0)
print_setup_parameter_info(mu1, cov1, mu2, cov2)
Mean X when Y = Blue: [0 0]
Covariance of X when Y = Blue:
[[4 0]
[0 1]]
Mean X when Y = Orange: [2 2]
Covariance of X when Y = Orange:
[[1. 0. ]
[0. 0.25]]
# Plot it!
make_plot(mu1, cov1, mu2, cov2)
Generative Models with sklearn
# Function to generate data according to LDA, QDA, or NB
def generate_data(n1, n2, n3, mu1, mu2, mu3, cov1, cov2, cov3):
    # Generate data for class A
    data1 = np.random.multivariate_normal(mu1, cov1, n1)
    labels1 = np.repeat("A", n1)

    # Generate data for class B
    data2 = np.random.multivariate_normal(mu2, cov2, n2)
    labels2 = np.repeat("B", n2)

    # Generate data for class C
    data3 = np.random.multivariate_normal(mu3, cov3, n3)
    labels3 = np.repeat("C", n3)

    # Combine data and labels
    data = np.concatenate((data1, data2, data3))
    labels = np.concatenate((labels1, labels2, labels3))

    # Make X and y
    X = pd.DataFrame(data, columns=["x1", "x2"])
    y = pd.Series(labels)

    return X, y
# Setup sample sizes
n1 = 500
n2 = 500
n3 = 500

# Setup parameters
mu1, cov1 = create_params(0, 0, 2, 1, 0.5)
mu2, cov2 = create_params(2, 2, 1, 2, -0.5)
mu3, cov3 = create_params(0, 2, 1, 1, 0.5)
# Generate train and test data
X_train, y_train = generate_data(n1, n2, n3, mu1, mu2, mu3, cov1, cov2, cov3)
X_test, y_test = generate_data(n1, n2, n3, mu1, mu2, mu3, cov1, cov2, cov3)
# Plot the train data
= sns.jointplot(x="x1", y="x2", data=X_train, hue=y_train, space=0)
p "Simulated Train Data", y=1.01)
p.fig.suptitle(True)
p.ax_joint.grid(8, 8) p.fig.set_size_inches(
# Gaussian Naive Bayes
gnb = GaussianNB()
gnb.fit(X_train, y_train)
y_pred = gnb.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Gaussian NB, Test Accuracy:", accuracy)

# Linear Discriminant Analysis
lda = LinearDiscriminantAnalysis(store_covariance=True)
lda.fit(X_train, y_train)
y_pred = lda.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("LDA, Test Accuracy:", accuracy)

# Quadratic Discriminant Analysis
qda = QuadraticDiscriminantAnalysis(store_covariance=True)
qda.fit(X_train, y_train)
y_pred = qda.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("QDA, Test Accuracy:", accuracy)
Gaussian NB, Test Accuracy: 0.7466666666666667
LDA, Test Accuracy: 0.688
QDA, Test Accuracy: 0.758
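The confusion matrix utilities imported at the top are not used above. As a hedged follow-up sketch, they could be applied to, say, the QDA predictions to see which classes get confused with each other:

```python
# Sketch: confusion matrix for the QDA predictions on the test data
cm = confusion_matrix(y_test, qda.predict(X_test), labels=qda.classes_)
ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=qda.classes_).plot()
plt.show()
```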
print("Classes:", gnb.classes_)
print("Feature names:", gnb.feature_names_in_)
print("NB, Estimated Class Priors:", gnb.class_prior_)
print("NB, Estimated Mean of each feature per class:", "\n", gnb.theta_)
print("NB, Estimated Variance of each feature per class:", "\n", gnb.var_)
Classes: ['A' 'B' 'C']
Feature names: ['x1' 'x2']
NB, Estimated Class Priors: [0.33333333 0.33333333 0.33333333]
NB, Estimated Mean of each feature per class:
[[4.01566160e-02 2.38873059e-05]
[2.08143224e+00 1.97557520e+00]
[3.66427998e-02 2.06823633e+00]]
NB, Estimated Variance of each feature per class:
[[4.04651343 0.97403523]
[1.07528142 4.16455559]
[0.95628049 0.86020391]]
print("Classes:", qda.classes_)
print("Feature names:", qda.feature_names_in_)
print("QDA, Estimated Class Priors:", qda.priors_)
print("QDA, Estimated Mean of each feature per class:", "\n", qda.means_)
print("QDA, Estimated Covariance Matrix of each feature per class:")
qda.covariance_
Classes: ['A' 'B' 'C']
Feature names: ['x1' 'x2']
QDA, Estimated Class Priors: [0.33333333 0.33333333 0.33333333]
QDA, Estimated Mean of each feature per class:
[[4.01566160e-02 2.38873059e-05]
[2.08143224e+00 1.97557520e+00]
[3.66427998e-02 2.06823633e+00]]
QDA, Estimated Covariance Matrix of each feature per class:
[array([[4.05462268, 0.92459548],
[0.92459548, 0.9759872 ]]),
array([[ 1.07743629, -0.94226649],
[-0.94226649, 4.17290139]]),
array([[0.95819688, 0.41460533],
[0.41460533, 0.86192776]])]
Effect of Scaling
# Scale the X data
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

p = sns.jointplot(x="x1", y="x2", data=pd.DataFrame(X_train_scaled, columns=["x1", "x2"]), hue=y_train, space=0)
p.fig.suptitle("Simulated Train Data, Scaled", y=1.01)
p.ax_joint.grid(True)
p.fig.set_size_inches(8, 8)
# Fit NB model again
gnbs = GaussianNB()
gnbs.fit(X_train_scaled, y_train)
y_pred = gnbs.predict(X_test_scaled)

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)
Accuracy: 0.7466666666666667
# Check that the estimated probabilities match: Gaussian NB is unaffected by
# standardizing, since rescaling a feature rescales its per-class means and
# variances together, and the change cancels in the posterior probabilities
gnb.predict_proba(X_test) == gnbs.predict_proba(X_test_scaled)
np.allclose(gnb.predict_proba(X_test), gnbs.predict_proba(X_test_scaled))
True
print("Classes:", gnbs.classes_)
print("NB, Estimated Class Priors:", gnbs.class_prior_)
print("NB, Estimated Mean of each feature per class:", "\n", gnbs.theta_)
print("NB, Estimated Variance of each feature per class:", "\n", gnbs.var_)
Classes: ['A' 'B' 'C']
NB, Estimated Class Priors: [0.33333333 0.33333333 0.33333333]
NB, Estimated Mean of each feature per class:
[[-0.39523727 -0.79023671]
[ 0.79251912 0.36795645]
[-0.39728185 0.42228026]]
NB, Estimated Variance of each feature per class:
[[1.37003745 0.3347804 ]
[0.36406053 1.43137695]
[0.3237701 0.29565605]]
Priors
# Setup sample sizes
n1_train = 500
n2_train = 300
n3_train = 700
n1_test = 500
n2_test = 500
n3_test = 500

# Setup parameters
mu1, cov1 = create_params(0, 0, 2, 1, 0.5)
mu2, cov2 = create_params(2, 2, 1, 2, -0.5)
mu3, cov3 = create_params(0, 2, 1, 1, 0.5)

# Generate train and test data
X_train, y_train = generate_data(n1_train, n2_train, n3_train, mu1, mu2, mu3, cov1, cov2, cov3)
X_test, y_test = generate_data(n1_test, n2_test, n3_test, mu1, mu2, mu3, cov1, cov2, cov3)
# Quadratic Discriminant Analysis, Estimated Prior
qda = QuadraticDiscriminantAnalysis(store_covariance=True)
qda.fit(X_train, y_train)
y_pred = qda.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("QDA, Test Accuracy:", accuracy)
QDA, Test Accuracy: 0.756
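When no priors argument is given, QDA estimates the priors from the training class frequencies, so here they reflect the 500/300/700 imbalance rather than the balanced test set. A quick check of the fitted attribute:

```python
# Estimated priors are the training class proportions (500/300/700 out of 1500)
print("QDA, Estimated Priors:", qda.priors_)
```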
# Quadratic Discriminant Analysis, Specified Prior
qda = QuadraticDiscriminantAnalysis(store_covariance=True, priors=[0.33, 0.33, 0.33])
qda.fit(X_train, y_train)
y_pred = qda.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("QDA, Test Accuracy:", accuracy)
QDA, Test Accuracy: 0.764
Penguins
from palmerpenguins import load_penguins
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
# Load the dataset
penguins = load_penguins()

# Convert to pandas dataframe
df = pd.DataFrame(penguins)

# Print the first 5 rows of the dataframe
df.head()
|   | species | island | bill_length_mm | bill_depth_mm | flipper_length_mm | body_mass_g | sex | year |
|---|---------|--------|----------------|---------------|-------------------|-------------|-----|------|
| 0 | Adelie | Torgersen | 39.1 | 18.7 | 181.0 | 3750.0 | male | 2007 |
| 1 | Adelie | Torgersen | 39.5 | 17.4 | 186.0 | 3800.0 | female | 2007 |
| 2 | Adelie | Torgersen | 40.3 | 18.0 | 195.0 | 3250.0 | female | 2007 |
| 3 | Adelie | Torgersen | NaN | NaN | NaN | NaN | NaN | 2007 |
| 4 | Adelie | Torgersen | 36.7 | 19.3 | 193.0 | 3450.0 | female | 2007 |
# Create X and y
= df.drop("year", axis=1)
df = df.drop("species", axis=1)
X = df["species"] y
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Print the shapes of the resulting dataframes
print("X_train shape:", X_train.shape)
print("y_train shape:", y_train.shape)
print("X_test shape:", X_test.shape)
print("y_test shape:", y_test.shape)
X_train shape: (275, 6)
y_train shape: (275,)
X_test shape: (69, 6)
y_test shape: (69,)
X_train
|     | island | bill_length_mm | bill_depth_mm | flipper_length_mm | body_mass_g | sex |
|-----|--------|----------------|---------------|-------------------|-------------|-----|
| 66  | Biscoe | 35.5 | 16.2 | 195.0 | 3350.0 | female |
| 229 | Biscoe | 51.1 | 16.3 | 220.0 | 6000.0 | male |
| 7   | Torgersen | 39.2 | 19.6 | 195.0 | 4675.0 | male |
| 140 | Dream | 40.2 | 17.1 | 193.0 | 3400.0 | female |
| 323 | Dream | 49.0 | 19.6 | 212.0 | 4300.0 | male |
| ... | ... | ... | ... | ... | ... | ... |
| 188 | Biscoe | 42.6 | 13.7 | 213.0 | 4950.0 | female |
| 71  | Torgersen | 39.7 | 18.4 | 190.0 | 3900.0 | male |
| 106 | Biscoe | 38.6 | 17.2 | 199.0 | 3750.0 | female |
| 270 | Biscoe | 47.2 | 13.7 | 214.0 | 4925.0 | female |
| 102 | Biscoe | 37.7 | 16.0 | 183.0 | 3075.0 | female |

275 rows × 6 columns
y_train
66 Adelie
229 Gentoo
7 Adelie
140 Adelie
323 Chinstrap
...
188 Gentoo
71 Adelie
106 Adelie
270 Gentoo
102 Adelie
Name: species, Length: 275, dtype: object
# Define the numeric features
= ["bill_length_mm", "bill_depth_mm", "flipper_length_mm", "body_mass_g"]
numeric_features
# Define the categorical features
= ["sex", "island"]
categorical_features
# Define the column transformer
= ColumnTransformer(
preprocessor =[
transformers"num", StandardScaler(), numeric_features),
("cat", OneHotEncoder(), categorical_features),
(
]
)
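As a hedged aside (not from the original notes), the column transformer can be fit on its own to inspect what it produces. `get_feature_names_out` assumes scikit-learn 1.0 or later, and letting OneHotEncoder see missing categorical values relies on the same behavior the grid search below already depends on.

```python
# Sketch: fit the preprocessor alone and inspect the transformed training matrix
Xt = preprocessor.fit_transform(X_train)
print(Xt.shape)
print(preprocessor.get_feature_names_out())
```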
# Define the full pipeline
= Pipeline([("preprocessor", preprocessor), ("imputer", SimpleImputer()), ("classifier", SVC())])
pipe
# Define the parameter grid
param_grid = {
    "imputer__strategy": ["mean", "median"],
    "classifier__C": [0.1, 1, 10],
    "classifier__kernel": ["linear", "rbf"],
}
# Define the grid search
grid = GridSearchCV(pipe, param_grid=param_grid, cv=5, verbose=1)
# Fit the grid search to the training data
grid.fit(X_train, y_train)
# Print the best parameters and score
print("Best parameters:", grid.best_params_)
print("Best score:", grid.best_score_)
Fitting 5 folds for each of 12 candidates, totalling 60 fits
Best parameters: {'classifier__C': 0.1, 'classifier__kernel': 'linear', 'imputer__strategy': 'mean'}
Best score: 0.9927272727272728
grid
GridSearchCV(cv=5, estimator=Pipeline(steps=[('preprocessor', ColumnTransformer(transformers=[('num', StandardScaler(), ['bill_length_mm', 'bill_depth_mm', 'flipper_length_mm', 'body_mass_g']), ('cat', OneHotEncoder(), ['sex', 'island'])])), ('imputer', SimpleImputer()), ('classifier', SVC())]), param_grid={'classifier__C': [0.1, 1, 10], 'classifier__kernel': ['linear', 'rbf'], 'imputer__strategy': ['mean', 'median']}, verbose=1)
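The grid search above reports only cross-validated accuracy. A natural follow-up, not shown in the original output, is to score the refit best pipeline on the held-out test split:

```python
# Sketch: evaluate the best pipeline found by the grid search on the test set
print("Test accuracy:", grid.score(X_test, y_test))
```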
One-Hot Encoding
X_train_cat = X_train[categorical_features]
X_train_cat.to_numpy()[:10]
array([['female', 'Biscoe'],
['male', 'Biscoe'],
['male', 'Torgersen'],
['female', 'Dream'],
['male', 'Dream'],
['male', 'Dream'],
['male', 'Dream'],
['female', 'Torgersen'],
['female', 'Dream'],
['female', 'Dream']], dtype=object)
ohe = OneHotEncoder(sparse_output=False)
ohe.fit(X_train_cat)
ohe.fit_transform(X_train_cat)
array([[1., 0., 0., 1., 0., 0.],
[0., 1., 0., 1., 0., 0.],
[0., 1., 0., 0., 0., 1.],
...,
[1., 0., 0., 1., 0., 0.],
[1., 0., 0., 1., 0., 0.],
[1., 0., 0., 1., 0., 0.]])
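To see which column corresponds to which category, the fitted encoder can report its output feature names (a hedged aside, assuming scikit-learn 1.0 or later for `get_feature_names_out`):

```python
# Sketch: column names for the one-hot encoded output above
print(ohe.get_feature_names_out(categorical_features))
```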
= OneHotEncoder(sparse_output=False, drop="first")
de
de.fit(X_train_cat) de.fit_transform(X_train_cat)
array([[0., 0., 0., 0.],
[1., 0., 0., 0.],
[1., 0., 0., 1.],
...,
[0., 0., 0., 0.],
[0., 0., 0., 0.],
[0., 0., 0., 0.]])
Imputation
X_train_num = X_train[numeric_features]
X_train_num_nan = X_train_num[X_train.isna().any(axis=1)]
X_train_num_nan.to_numpy()
array([[ 47.3, 13.8, 216. , 4725. ],
[ nan, nan, nan, nan],
[ 37.8, 17.1, 186. , 3300. ],
[ 44.5, 14.3, 216. , 4100. ],
[ 44.5, 15.7, 217. , 4875. ],
[ 46.2, 14.4, 214. , 4650. ],
[ 37.5, 18.9, 179. , 2975. ],
[ 37.8, 17.3, 180. , 3700. ],
[ 34.1, 18.1, 193. , 3475. ]])
imp = SimpleImputer()
imp.fit(X_train_num)
imp.fit_transform(X_train_num_nan)
array([[ 47.3 , 13.8 , 216. , 4725. ],
[ 41.2125, 16.2 , 200.125 , 3975. ],
[ 37.8 , 17.1 , 186. , 3300. ],
[ 44.5 , 14.3 , 216. , 4100. ],
[ 44.5 , 15.7 , 217. , 4875. ],
[ 46.2 , 14.4 , 214. , 4650. ],
[ 37.5 , 18.9 , 179. , 2975. ],
[ 37.8 , 17.3 , 180. , 3700. ],
[ 34.1 , 18.1 , 193. , 3475. ]])
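Note that `fit_transform` above refits the imputer on just the rows that contain a missing value, so the fill values are the column means of those nine rows (for example, 41.2125 for bill length), not the means of the full training data. A hedged check via the `statistics_` attribute:

```python
# Sketch: compare the learned fill values (column means) from the two fits
print("Fit on all training rows:   ", SimpleImputer().fit(X_train_num).statistics_)
print("Fit on NaN-containing rows: ", SimpleImputer().fit(X_train_num_nan).statistics_)
```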