05-Ridge Regression Code Demo

Author

Dr. Cheng-Han Yu

1 R implementation

1.1 MASS::lm.ridge()

Code
library(MASS)
fit <- MASS::lm.ridge(mpg ~ ., data = mtcars, lambda = 1) ## ridge fit
coef(fit) ## original scale
                      cyl         disp           hp         drat           wt 
16.537660830 -0.162402755  0.002333078 -0.014934856  0.924631319 -2.461146015 
        qsec           vs           am         gear         carb 
 0.492587517  0.374651744  2.308375781  0.685715851 -0.575791252 
Code
fit$coef ## standardized scale
       cyl       disp         hp       drat         wt       qsec         vs 
-0.2854708  0.2846046 -1.0078499  0.4865947 -2.3702010  0.8663632  0.1858566 
        am       gear       carb 
 1.1337179  0.4979561 -0.9153711 
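The two sets of coefficients are connected by the column scalings lm.ridge() applies internally (each centered column is divided by its root mean square, stored in fit$scales). A minimal check under that assumption:

Code
## original-scale slopes should equal the standardized coefficients
## divided by the stored column scalings
all.equal(drop(fit$coef / fit$scales), coef(fit)[-1], check.attributes = FALSE)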
Code
X <- scale(data.matrix(mtcars[, -1]), center = TRUE, scale = TRUE)
Code
df <- data.frame(cbind(mtcars[, 1, drop=FALSE], X))
ridge_fit <- lm.ridge(mpg ~ ., data = df, lambda = 0:40)
MASS::select(ridge_fit)
modified HKB estimator is 2.58585 
modified L-W estimator is 1.837435 
smallest value of GCV  at 15 
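The GCV-minimizing value can also be extracted programmatically from the GCV component of the fit; a short sketch:

Code
## lambda at which GCV is smallest (15 here, matching select())
ridge_fit$lambda[which.min(ridge_fit$GCV)]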
Code
par(mar = c(4, 4, 0, 0))
plot(ridge_fit$lambda, ridge_fit$GCV, type = "l", col = "darkgreen", 
     ylab = "GCV", xlab = "Lambda", lwd = 3)

1.2 glmnet::glmnet()

By default, glmnet() standardizes the x variables (standardize = TRUE) and does not standardize the response (standardize.response = FALSE; this option only applies to the multivariate Gaussian family).

Regardless of the standardize setting, glmnet() always returns the coefficients on the original scale of the inputs (a quick sanity check appears below, right after the package is loaded).

1.2.1 Use standardized X as the original scale

Code
library(glmnet)
Loading required package: Matrix
Loaded glmnet 4.1-8
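Before fitting, a quick sanity check of the original-scale claim above (a sketch: if the returned coefficients are on the raw scale, a manual linear predictor on the raw inputs should match predict()):

Code
xraw <- data.matrix(mtcars[, -1])
fit1 <- glmnet(xraw, mtcars$mpg, alpha = 0, lambda = 1)
b <- as.vector(coef(fit1))       ## intercept + slopes, original x scale
manual <- b[1] + xraw %*% b[-1]  ## manual prediction on the raw inputs
all.equal(as.vector(manual), as.vector(predict(fit1, newx = xraw)))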
Code
ridge_cv_fit <- cv.glmnet(x = X, y = mtcars$mpg, alpha = 0,
                          nfolds = 10, type.measure = "mse")
plot(ridge_cv_fit$glmnet.fit, "lambda")

Code
plot(ridge_cv_fit)

Code
ridge_cv_fit$lambda.min
[1] 3.014598
Code
# largest lambda s.t. the CV error is within 1 s.e. of the minimum
ridge_cv_fit$lambda.1se 
[1] 11.08883
Code
coef(ridge_cv_fit, s = "lambda.min")
11 x 1 sparse Matrix of class "dgCMatrix"
                    s1
(Intercept) 20.0906250
cyl         -0.6681360
disp        -0.6591218
hp          -0.7889394
drat         0.5644227
wt          -1.1786358
qsec         0.2866108
vs           0.3966957
am           0.7941621
gear         0.3997316
carb        -0.8635288
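Predictions at the selected penalty come from the same fitted object; a short sketch with predict():

Code
## fitted values at the CV-selected lambda; use s = "lambda.1se" for
## the more heavily penalized choice
head(predict(ridge_cv_fit, newx = X, s = "lambda.min"))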

1.2.2 Use non-standardized X as the original scale

Code
ridge_cv_fit_ori <- cv.glmnet(x = data.matrix(mtcars[, -1]), y = mtcars$mpg, alpha = 0,
                              nfolds = 10, type.measure = "mse")
plot(ridge_cv_fit_ori$glmnet.fit, "lambda")

Code
plot(ridge_cv_fit_ori)

Code
ridge_cv_fit_ori$lambda.min
[1] 3.014598
Code
# largest lambda s.t. the CV error is within 1 s.e. of the minimum
ridge_cv_fit_ori$lambda.1se 
[1] 12.16998
Code
coef(ridge_cv_fit_ori, s = "lambda.min")
11 x 1 sparse Matrix of class "dgCMatrix"
                      s1
(Intercept) 21.051283516
cyl         -0.374112703
disp        -0.005318127
hp          -0.011506803
drat         1.055629523
wt          -1.204585685
qsec         0.160391657
vs           0.787069385
am           1.591536197
gear         0.541785546
carb        -0.534626533
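Note that lambda.1se differs between the two runs because cv.glmnet() assigns folds at random. A sketch (with a hypothetical seed) that fixes a common fold assignment via foldid so the two fits are directly comparable:

Code
set.seed(2024)  ## hypothetical seed, for reproducibility only
foldid <- sample(rep(1:10, length.out = nrow(mtcars)))
cv_std <- cv.glmnet(x = X, y = mtcars$mpg, alpha = 0, foldid = foldid)
cv_raw <- cv.glmnet(x = data.matrix(mtcars[, -1]), y = mtcars$mpg,
                    alpha = 0, foldid = foldid)
c(std = cv_std$lambda.1se, raw = cv_raw$lambda.1se)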

2 Python implementation

Code
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import Ridge, RidgeCV
from sklearn.preprocessing import StandardScaler
Code
mtcars = pd.read_csv("../data/mtcars.csv")
X = mtcars.drop(columns=["mpg"])
y = mtcars["mpg"]
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
ridge_model = Ridge(alpha=1)  ## sklearn's alpha corresponds to n * lambda in glmnet's parameterization
ridge_model.fit(X_scaled, y)
Ridge(alpha=1)
Code
ridge_model.coef_
array([-0.28547077,  0.28460463, -1.00784994,  0.4865947 , -2.37020101,
        0.86636324,  0.18585663,  1.13371791,  0.49795614, -0.91537115])
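These match the standardized-scale coefficients from lm.ridge(lambda = 1) above. A minimal sketch of what Ridge() solves here (assuming it centers X and y, leaves the intercept unpenalized, and solves the penalized normal equations), which should reproduce the coefficients:

Code
# closed-form ridge solution on centered data; X_scaled is already centered
yc = (y - y.mean()).to_numpy()
p = X_scaled.shape[1]
beta = np.linalg.solve(X_scaled.T @ X_scaled + 1.0 * np.eye(p), X_scaled.T @ yc)
np.allclose(beta, ridge_model.coef_)  # should be True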
Code
lambdas = np.arange(1, 41) ## alpha must be > 0
# Ridge regression with cross-validation to select the best lambda
# Enable storing CV values (cv_values_ is only available with the default leave-one-out CV)
ridge_cv = RidgeCV(alphas=lambdas, store_cv_values=True)  
ridge_cv.fit(X_scaled, y)
RidgeCV(alphas=array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17,
       18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34,
       35, 36, 37, 38, 39, 40]),
        store_cv_values=True)
Code
# Optimal lambda and corresponding coefficients
optimal_lambda = ridge_cv.alpha_
optimal_lambda
13
Code
optimal_coefficients = ridge_cv.coef_
optimal_coefficients
array([-0.64745384, -0.62930084, -0.79208923,  0.55274563, -1.2273356 ,
        0.2909736 ,  0.37094879,  0.81866969,  0.3981575 , -0.89842163])
Code
ridge_cv.intercept_
20.090625000000003
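Because the standardized predictors are mean-centered, the intercept is simply the sample mean of the response; a one-line check:

Code
y.mean()  # equals ridge_cv.intercept_ since X_scaled has mean-zero columns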
Code
# Cross-validation mean squared error for each lambda
cv_mse = np.mean(ridge_cv.cv_values_, axis=0)

# Plot the CV MSE vs Lambda
plt.plot(lambdas, cv_mse, marker="o", linestyle="-")
plt.axvline(optimal_lambda, color="red", linestyle="--", 
            label=f"Optimal Lambda = {optimal_lambda}")
plt.xlabel("Lambda (Alpha)")
plt.ylabel("Mean Squared Error (MSE)")
plt.title("Cross-Validation MSE vs Lambda")
plt.legend()
plt.show()

Here we transform the coefficients back to the original scale when non-standardized X is used.

Code
# Reverse standardization
std_devs = scaler.scale_  # Feature standard deviations
means = scaler.mean_      # Feature means

coef_original = optimal_coefficients / std_devs
coef_original
array([-0.36833293, -0.00515876, -0.0117376 ,  1.05033188, -1.27442867,
        0.16543865,  0.74776248,  1.66690255,  0.54828706, -0.56512958])
Code
intercept_original = ridge_cv.intercept_ - np.sum(optimal_coefficients * means / std_devs)
intercept_original
21.21467495074396
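A quick sanity check (sketch) that the back-transformation is right: predictions built from the original-scale coefficients on the raw X should match the fitted model's predictions on the standardized X:

Code
pred_std = ridge_cv.predict(X_scaled)
pred_raw = X.to_numpy() @ coef_original + intercept_original
np.allclose(pred_std, pred_raw)  # should be True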
Code
## the lambda range matches the one used in cv.glmnet(); given sklearn's
## n * lambda scaling, should each value be multiplied by n = 32?
lambdas = 5 * 10 ** np.linspace(-1, 3, 100)

# RidgeCV with 10-fold cross-validation
ridge_cv = RidgeCV(alphas=lambdas, scoring="neg_mean_squared_error", cv=10)
ridge_cv.fit(X_scaled, y)
RidgeCV(alphas=array([5.00000000e-01, 5.48749383e-01, 6.02251770e-01, 6.60970574e-01,
       7.25414389e-01, 7.96141397e-01, 8.73764200e-01, 9.58955131e-01,
       1.05245207e+00, 1.15506485e+00, 1.26768225e+00, 1.39127970e+00,
       1.52692775e+00, 1.67580133e+00, 1.83918989e+00, 2.01850863e+00,
       2.21531073e+00, 2.43130079e+00, 2.66834962e+00, 2.92851041e+00,
       3.21403656e+00, 3.52740116e+0...
       5.88405976e+02, 6.45774833e+02, 7.08737081e+02, 7.77838072e+02,
       8.53676324e+02, 9.36908711e+02, 1.02825615e+03, 1.12850986e+03,
       1.23853818e+03, 1.35929412e+03, 1.49182362e+03, 1.63727458e+03,
       1.79690683e+03, 1.97210303e+03, 2.16438064e+03, 2.37540508e+03,
       2.60700414e+03, 2.86118383e+03, 3.14014572e+03, 3.44630605e+03,
       3.78231664e+03, 4.15108784e+03, 4.55581378e+03, 5.00000000e+03]),
        cv=10, scoring='neg_mean_squared_error')
Code
best_lambda = ridge_cv.alpha_

coefficients = []
for lam in lambdas:
    ridge = Ridge(alpha=lam)
    ridge.fit(X_scaled, y)
    coefficients.append(ridge.coef_)
Ridge(alpha=5000.0)
Code
coefficients = np.array(coefficients)
for i in range(coefficients.shape[1]):
    plt.plot(np.log(lambdas), coefficients[:, i])
# mark the CV-selected penalty on the coefficient path
plt.axvline(np.log(best_lambda), color="red", linestyle="--",
            label=f"Best Lambda = {best_lambda:.2f}")
plt.xlabel("Log(Lambda)")
plt.ylabel("Coefficients")
plt.legend()
plt.show()