05-Ridge Regression Code Demo

Author

Dr. Cheng-Han Yu

1 R implementation

1.1 MASS::lm.ridge()

Code
library(MASS)
fit <- MASS::lm.ridge(mpg ~ ., data = mtcars, lambda = 1) ## ridge fit
coef(fit) ## original scale
                      cyl         disp           hp         drat           wt 
16.537660830 -0.162402755  0.002333078 -0.014934856  0.924631319 -2.461146015 
        qsec           vs           am         gear         carb 
 0.492587517  0.374651744  2.308375781  0.685715851 -0.575791252 
Code
fit$coef ## standardized scale
       cyl       disp         hp       drat         wt       qsec         vs 
-0.2854708  0.2846046 -1.0078499  0.4865947 -2.3702010  0.8663632  0.1858566 
        am       gear       carb 
 1.1337179  0.4979561 -0.9153711 
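The two sets of coefficients are connected by the column scalings lm.ridge() applies internally (each centered column is divided by its root mean square, stored in fit$scales). A minimal check under that assumption:

Code
## original-scale slopes should equal the standardized coefficients
## divided by the stored column scalings
all.equal(drop(fit$coef / fit$scales), coef(fit)[-1], check.attributes = FALSE)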
Code
X <- scale(data.matrix(mtcars[, -1]), center = TRUE, scale = TRUE)
Code
df <- data.frame(cbind(mtcars[, 1, drop=FALSE], X))
ridge_fit <- lm.ridge(mpg ~ ., data = df, lambda = 0:40)
MASS::select(ridge_fit)
modified HKB estimator is 2.58585 
modified L-W estimator is 1.837435 
smallest value of GCV  at 15 
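The GCV-minimizing value can also be extracted programmatically from the GCV component of the fit; a short sketch:

Code
## lambda at which GCV is smallest (15 here, matching select())
ridge_fit$lambda[which.min(ridge_fit$GCV)]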
Code
par(mar = c(4, 4, 0, 0))
plot(ridge_fit$lambda, ridge_fit$GCV, type = "l", col = "darkgreen", 
     ylab = "GCV", xlab = "Lambda", lwd = 3)

1.2 glmnet::glmnet()

By default, glmnet() standardizes the x variables (standardize = TRUE) and does not standardize the response (standardize.response = FALSE; this option only applies to the multivariate Gaussian family).

Regardless of the standardize setting, glmnet() always returns the coefficients on the original scale of the inputs (a quick sanity check appears below, right after the package is loaded).

1.2.1 Use standardized X as the original scale

Code
library(glmnet)
Loading required package: Matrix
Loaded glmnet 4.1-8
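Before fitting, a quick sanity check of the original-scale claim above (a sketch: if the returned coefficients are on the raw scale, a manual linear predictor on the raw inputs should match predict()):

Code
xraw <- data.matrix(mtcars[, -1])
fit1 <- glmnet(xraw, mtcars$mpg, alpha = 0, lambda = 1)
b <- as.vector(coef(fit1))       ## intercept + slopes, original x scale
manual <- b[1] + xraw %*% b[-1]  ## manual prediction on the raw inputs
all.equal(as.vector(manual), as.vector(predict(fit1, newx = xraw)))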
Code
ridge_cv_fit <- cv.glmnet(x = X, y = mtcars$mpg, alpha = 0,
                          nfolds = 10, type.measure = "mse")
plot(ridge_cv_fit$glmnet.fit, "lambda")

Code
plot(ridge_cv_fit)

Code
ridge_cv_fit$lambda.min
[1] 3.014598
Code
# largest lambda s.t. the CV error is within 1 s.e. of the minimum
ridge_cv_fit$lambda.1se 
[1] 11.08883
Code
coef(ridge_cv_fit, s = "lambda.min")
11 x 1 sparse Matrix of class "dgCMatrix"
                    s1
(Intercept) 20.0906250
cyl         -0.6681360
disp        -0.6591218
hp          -0.7889394
drat         0.5644227
wt          -1.1786358
qsec         0.2866108
vs           0.3966957
am           0.7941621
gear         0.3997316
carb        -0.8635288
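Predictions at the selected penalty come from the same fitted object; a short sketch with predict():

Code
## fitted values at the CV-selected lambda; use s = "lambda.1se" for
## the more heavily penalized choice
head(predict(ridge_cv_fit, newx = X, s = "lambda.min"))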

1.2.2 Use non-standardized X as the original scale

Code
ridge_cv_fit_ori <- cv.glmnet(x = data.matrix(mtcars[, -1]), y = mtcars$mpg, alpha = 0,
                              nfolds = 10, type.measure = "mse")
plot(ridge_cv_fit_ori$glmnet.fit, "lambda")

Code
plot(ridge_cv_fit_ori)

Code
ridge_cv_fit_ori$lambda.min
[1] 3.014598
Code
# largest lambda s.t. the CV error is within 1 s.e. of the minimum
ridge_cv_fit_ori$lambda.1se 
[1] 12.16998
Code
coef(ridge_cv_fit_ori, s = "lambda.min")
11 x 1 sparse Matrix of class "dgCMatrix"
                      s1
(Intercept) 21.051283516
cyl         -0.374112703
disp        -0.005318127
hp          -0.011506803
drat         1.055629523
wt          -1.204585685
qsec         0.160391657
vs           0.787069385
am           1.591536197
gear         0.541785546
carb        -0.534626533
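Note that lambda.1se differs between the two runs because cv.glmnet() assigns folds at random. A sketch (with a hypothetical seed) that fixes a common fold assignment via foldid so the two fits are directly comparable:

Code
set.seed(2024)  ## hypothetical seed, for reproducibility only
foldid <- sample(rep(1:10, length.out = nrow(mtcars)))
cv_std <- cv.glmnet(x = X, y = mtcars$mpg, alpha = 0, foldid = foldid)
cv_raw <- cv.glmnet(x = data.matrix(mtcars[, -1]), y = mtcars$mpg,
                    alpha = 0, foldid = foldid)
c(std = cv_std$lambda.1se, raw = cv_raw$lambda.1se)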

2 Python implementation

Code
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import Ridge, RidgeCV
from sklearn.preprocessing import StandardScaler
Code
mtcars = pd.read_csv("../data/mtcars.csv")
X = mtcars.drop(columns=["mpg"])
y = mtcars["mpg"]
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
ridge_model = Ridge(alpha=1)  ## sklearn's alpha corresponds to n * lambda in glmnet's parameterization
ridge_model.fit(X_scaled, y)
Ridge(alpha=1)
Code
ridge_model.coef_
array([-0.28547077,  0.28460463, -1.00784994,  0.4865947 , -2.37020101,
        0.86636324,  0.18585663,  1.13371791,  0.49795614, -0.91537115])
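These match the standardized-scale coefficients from lm.ridge(lambda = 1) above. A minimal sketch of what Ridge() solves here (assuming it centers X and y, leaves the intercept unpenalized, and solves the penalized normal equations), which should reproduce the coefficients:

Code
# closed-form ridge solution on centered data; X_scaled is already centered
yc = (y - y.mean()).to_numpy()
p = X_scaled.shape[1]
beta = np.linalg.solve(X_scaled.T @ X_scaled + 1.0 * np.eye(p), X_scaled.T @ yc)
np.allclose(beta, ridge_model.coef_)  # should be True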
Code
lambdas = np.arange(1, 41) ## alpha must be > 0
# Ridge regression with cross-validation to select the best lambda
# Enable storing CV values (cv_values_ is only available with the default leave-one-out CV)
ridge_cv = RidgeCV(alphas=lambdas, store_cv_values=True)  
ridge_cv.fit(X_scaled, y)
RidgeCV(alphas=array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17,
       18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34,
       35, 36, 37, 38, 39, 40]),
        store_cv_values=True)
Code
# Optimal lambda and corresponding coefficients
optimal_lambda = ridge_cv.alpha_
optimal_lambda
13
Code
optimal_coefficients = ridge_cv.coef_
optimal_coefficients
array([-0.64745384, -0.62930084, -0.79208923,  0.55274563, -1.2273356 ,
        0.2909736 ,  0.37094879,  0.81866969,  0.3981575 , -0.89842163])
Code
ridge_cv.intercept_
20.090625000000003
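Because the standardized predictors are mean-centered, the intercept is simply the sample mean of the response; a one-line check:

Code
y.mean()  # equals ridge_cv.intercept_ since X_scaled has mean-zero columns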
Code
# Cross-validation mean squared error for each lambda
cv_mse = np.mean(ridge_cv.cv_values_, axis=0)

# Plot the CV MSE vs Lambda
plt.plot(lambdas, cv_mse, marker="o", linestyle="-")
plt.axvline(optimal_lambda, color="red", linestyle="--", 
            label=f"Optimal Lambda = {optimal_lambda}")
plt.xlabel("Lambda (Alpha)")
plt.ylabel("Mean Squared Error (MSE)")
plt.title("Cross-Validation MSE vs Lambda")
plt.legend()
plt.show()

Here we transform the coefficients back to the original scale when non-standardized X is used.

Code
# Reverse standardization
std_devs = scaler.scale_  # Feature standard deviations
means = scaler.mean_      # Feature means

coef_original = optimal_coefficients / std_devs
coef_original
array([-0.36833293, -0.00515876, -0.0117376 ,  1.05033188, -1.27442867,
        0.16543865,  0.74776248,  1.66690255,  0.54828706, -0.56512958])
Code
intercept_original = ridge_cv.intercept_ - np.sum(optimal_coefficients * means / std_devs)
intercept_original
21.21467495074396
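A quick sanity check (sketch) that the back-transformation is right: predictions built from the original-scale coefficients on the raw X should match the fitted model's predictions on the standardized X:

Code
pred_std = ridge_cv.predict(X_scaled)
pred_raw = X.to_numpy() @ coef_original + intercept_original
np.allclose(pred_std, pred_raw)  # should be True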
Code
## the lambda range matches the one used in cv.glmnet(); given sklearn's
## n * lambda scaling, should each value be multiplied by n = 32?
lambdas = 5 * 10 ** np.linspace(-1, 3, 100)

# RidgeCV with 10-fold cross-validation
ridge_cv = RidgeCV(alphas=lambdas, scoring="neg_mean_squared_error", cv=10)
ridge_cv.fit(X_scaled, y)
RidgeCV(alphas=array([5.00000000e-01, 5.48749383e-01, 6.02251770e-01, 6.60970574e-01,
       7.25414389e-01, 7.96141397e-01, 8.73764200e-01, 9.58955131e-01,
       1.05245207e+00, 1.15506485e+00, 1.26768225e+00, 1.39127970e+00,
       1.52692775e+00, 1.67580133e+00, 1.83918989e+00, 2.01850863e+00,
       2.21531073e+00, 2.43130079e+00, 2.66834962e+00, 2.92851041e+00,
       3.21403656e+00, 3.52740116e+0...
       5.88405976e+02, 6.45774833e+02, 7.08737081e+02, 7.77838072e+02,
       8.53676324e+02, 9.36908711e+02, 1.02825615e+03, 1.12850986e+03,
       1.23853818e+03, 1.35929412e+03, 1.49182362e+03, 1.63727458e+03,
       1.79690683e+03, 1.97210303e+03, 2.16438064e+03, 2.37540508e+03,
       2.60700414e+03, 2.86118383e+03, 3.14014572e+03, 3.44630605e+03,
       3.78231664e+03, 4.15108784e+03, 4.55581378e+03, 5.00000000e+03]),
        cv=10, scoring='neg_mean_squared_error')
Code
best_lambda = ridge_cv.alpha_

coefficients = []
for lam in lambdas:
    ridge = Ridge(alpha=lam)
    ridge.fit(X_scaled, y)
    coefficients.append(ridge.coef_)
Ridge(alpha=5000.0)
Code
coefficients = np.array(coefficients)
for i in range(coefficients.shape[1]):
    plt.plot(np.log(lambdas), coefficients[:, i])
# mark the CV-selected penalty on the coefficient path
plt.axvline(np.log(best_lambda), color="red", linestyle="--",
            label=f"Best Lambda = {best_lambda:.2f}")
plt.xlabel("Log(Lambda)")
plt.ylabel("Coefficients")
plt.legend()
plt.show()