birthrates <- read.csv("../data/birthrates.csv")
library(splines)
For linear splines, change degree = 3 to degree = 1.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
birthrates = pd.read_csv("../data/birthrates.csv")
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures
birthrates['Year_centered'] = birthrates['Year'] - birthrates['Year'].mean()
# Polynomial regression with degree = 3
poly = PolynomialFeatures(degree=3, include_bias=False)
X_poly = poly.fit_transform(birthrates[['Year_centered']])
polyfit3 = LinearRegression().fit(X_poly, birthrates['Birthrate'])
plt.scatter(birthrates['Year'], birthrates['Birthrate'], color='blue')
plt.plot(birthrates['Year'], polyfit3.predict(X_poly), color='red', linewidth=2)
plt.title("Cubic Polynomial Regression (Degree = 3)")
plt.xlabel("Year")
plt.ylabel("Birthrate")
plt.show()
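Unlike the spline fits below, a polynomial model extrapolates mechanically to new inputs. A minimal sketch (the years 2005 and 2010 are illustrative); note that new inputs must be centered with the same mean used in training:

# Predict at new years with the fitted cubic polynomial
new_years = np.array([[2005], [2010]]) - birthrates['Year'].mean()
print(polyfit3.predict(poly.transform(new_years)))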
from patsy import dmatrix
from sklearn.linear_model import LinearRegression
For linear splines, change degree=3 to degree=1 (see the sketch after the first cubic-spline example below).
NOTE: I can't find a built-in prediction or extrapolation method for these cubic-spline fits; one workaround is sketched below. Use patsy.cr() to fit natural cubic splines.
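A minimal sketch of both points, assuming patsy's cr() and build_design_matrices (the df value and the new years are illustrative): fit a natural cubic spline, then predict at new x values by rebuilding the basis from the stored design_info.

from patsy import build_design_matrices

# Natural cubic spline basis via patsy's cr()
nat_basis = dmatrix("cr(Year, df=5)", {"Year": birthrates["Year"]},
                    return_type="dataframe")
nat_sp = LinearRegression().fit(nat_basis, birthrates["Birthrate"])

# Predict at new years by rebuilding the basis with the training design_info
new_basis = build_design_matrices([nat_basis.design_info],
                                  {"Year": np.array([2005, 2010])})[0]
print(nat_sp.predict(new_basis))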
knots = [1936, 1960, 1978]

# https://patsy.readthedocs.io/en/latest/API-reference.html
# Generate cubic spline basis functions with the specified knots
spline_basis = dmatrix(
    "bs(Year, degree=3, knots=knots, include_intercept=True)",
    {"Year": birthrates["Year"]},
    return_type="dataframe"
)

# Fit the cubic spline model
# import statsmodels.api as sm
# model = sm.OLS(birthrates["Birthrate"], spline_basis).fit()
# birthrates["Fitted"] = model.fittedvalues
cub_sp = LinearRegression().fit(spline_basis, birthrates["Birthrate"])
# Plot the data and the fitted spline
"Year"], birthrates["Birthrate"], color="blue")
plt.scatter(birthrates["Year"], cub_sp.predict(spline_basis), color="red",
plt.plot(birthrates[=2)
linewidth"Cubic Spline (d=3) with 3 Knots")
plt.title("Year")
plt.xlabel("Birthrate")
plt.ylabel( plt.show()
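As noted above, the linear-spline variant only changes the degree. A minimal sketch reusing the knots from the cubic fit:

# Linear spline: same construction with degree=1
lin_basis = dmatrix(
    "bs(Year, degree=1, knots=knots, include_intercept=True)",
    {"Year": birthrates["Year"]},
    return_type="dataframe"
)
lin_sp = LinearRegression().fit(lin_basis, birthrates["Birthrate"])
plt.scatter(birthrates["Year"], birthrates["Birthrate"], color="blue")
plt.plot(birthrates["Year"], lin_sp.predict(lin_basis), color="red", linewidth=2)
plt.title("Linear Spline (d=1) with 3 Knots")
plt.show()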
# https://patsy.readthedocs.io/en/latest/API-reference.html
# Generate cubic spline basis functions with specified degrees of freedom.
# With include_intercept=True, df=7 spans the constant term, i.e. 6 df
# beyond the intercept (hence the plot title below).
spline_basis = dmatrix(
    "bs(Year, degree=3, df=7, include_intercept=True)",
    {"Year": birthrates["Year"]},
    return_type="dataframe"
)

# Fit the cubic spline model
# import statsmodels.api as sm
# model = sm.OLS(birthrates["Birthrate"], spline_basis).fit()
# birthrates["Fitted"] = model.fittedvalues
cub_sp = LinearRegression().fit(spline_basis, birthrates["Birthrate"])
# Plot the data and the fitted spline
"Year"], birthrates["Birthrate"], color="blue")
plt.scatter(birthrates["Year"], cub_sp.predict(spline_basis), color="red",
plt.plot(birthrates[=2)
linewidth"Cubic Spline (df=6)")
plt.title("Year")
plt.xlabel("Birthrate")
plt.ylabel( plt.show()
We use scipy.interpolate.make_smoothing_spline. Python has no smoothing-spline function that can directly specify the degrees of freedom; please let me know if you find one. To get similar smoothing results, R and Python need quite different values of the penalty term \(\lambda\), the degrees of freedom, and the smoothing factor.
from scipy.interpolate import make_smoothing_spline
= birthrates["Year"].values
x = birthrates["Birthrate"].values
y = make_smoothing_spline(x, y, lam=20)
spline # Predict for the range of years
= np.linspace(1917, 2003, 500)
x_pred = spline(x_pred)
y_pred # Plot the original data
=(10, 6))
plt.figure(figsize='blue', label='Data', s=50)
plt.scatter(x, y, color='red', linewidth=3, label='Smoothing Spline')
plt.plot(x_pred, y_pred, color"Smoothing Spline")
plt.title("Year")
plt.xlabel("Birthrate")
plt.ylabel( plt.show()
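A minimal sketch of how the penalty \(\lambda\) controls smoothness (the values are illustrative): larger lam gives a smoother, flatter curve.

# Compare several penalty values on the same data
for lam in [0.1, 20, 1000]:
    sp = make_smoothing_spline(x, y, lam=lam)
    plt.plot(x_pred, sp(x_pred), label=f"lam = {lam}")
plt.scatter(x, y, color="blue", s=20)
plt.legend()
plt.title("Effect of the penalty lambda")
plt.show()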
To use a smoothing factor, use splrep (or the newer make_splrep) and BSpline. The smoothing factor is set unreasonably high, to 4500; please let me know if you figure out why.
from scipy.interpolate import splrep, BSpline
= birthrates["Year"].values
x = birthrates["Birthrate"].values
y
# Fit the smoothing spline with a smoothing factor
= 4500 # Adjust this for the desired smoothness
smoothing_factor = splrep(x, y, s=smoothing_factor)
tck
# Predict for a range of years
= np.linspace(1917, 2003, 500)
x_pred = BSpline(*tck)(x_pred)
y_pred
# Plot the original data
=(10, 6))
plt.figure(figsize='blue', label='Data', s=50)
plt.scatter(x, y, color
# Plot the fitted smoothing spline
='red', linewidth=3, label='Smoothing Spline')
plt.plot(x_pred, y_pred, color
# Add labels and title
"Smoothing Spline with splrep")
plt.title("Year")
plt.xlabel("Birthrate")
plt.ylabel( plt.show()
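A minimal sketch of the make_splrep variant (an assumption: it is available in SciPy 1.15+ and returns a BSpline directly instead of a (t, c, k) tuple):

from scipy.interpolate import make_splrep

spl = make_splrep(x, y, s=smoothing_factor)  # returns a BSpline
plt.plot(x_pred, spl(x_pred), color='green', linewidth=2, label='make_splrep')
plt.scatter(x, y, color='blue', s=20)
plt.legend()
plt.show()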
https://kirenz.github.io/regression/docs/gam.html
https://gist.github.com/josef-pkt/453de603b019143e831fbdd4dfb6aa30
from statsmodels.gam.api import BSplines
from statsmodels.gam.api import GLMGam
from statsmodels.tools.eval_measures import mse, rmse
import statsmodels.api as sm
import patsy
= pd.read_csv("../data/Wage.csv")
Wage 'education'] = pd.Categorical(Wage['education'], categories=['1. < HS Grad', '2. HS Grad', '3. Some College', '4. College Grad', '5. Advanced Degree'], ordered=True)
Wage[
# penalization weights are taken from mgcv to match up its results
# sp = np.array([0.830689464223685, 425.361212061649])
# s_scale = np.array([2.443955e-06, 0.007945455])
x_spline = Wage[['year', 'age']].values
exog = patsy.dmatrix('education', data=Wage)

# TODO: set `include_intercept=True` automatically if constraints='center'
bs = BSplines(x_spline, df=[4, 5], degree=[3, 3], variable_names=['year', 'age'],
              constraints='center', include_intercept=True)
# alpha = 1 / s_scale * sp / 2
gam_bs = GLMGam(Wage['wage'], exog=exog, smoother=bs)
res = gam_bs.fit()
fig, axes = plt.subplots(1, 2, figsize=(12, 6))
res.plot_partial(0, cpr=False, include_constant=False, ax=axes[0])
axes[0].set_title("Year")
res.plot_partial(1, cpr=False, include_constant=False, ax=axes[1])
axes[1].set_title("Age")

plt.tight_layout()
plt.show()
print(res.summary())
Generalized Linear Model Regression Results
==============================================================================
Dep. Variable: wage No. Observations: 3000
Model: GLMGam Df Residuals: 2988
Model Family: Gaussian Df Model: 11.00
Link Function: Identity Scale: 1238.8
Method: PIRLS Log-Likelihood: -14934.
Date: Mon, 10 Feb 2025 Deviance: 3.7014e+06
Time: 21:43:15 Pearson chi2: 3.70e+06
No. Iterations: 3 Pseudo R-squ. (CS): 0.3358
Covariance Type: nonrobust
===================================================================================================
coef std err z P>|z| [0.025 0.975]
---------------------------------------------------------------------------------------------------
Intercept 85.6860 2.156 39.745 0.000 81.461 89.911
education[T.2. HS Grad] 10.7413 2.431 4.418 0.000 5.977 15.506
education[T.3. Some College] 23.2067 2.563 9.056 0.000 18.184 28.229
education[T.4. College Grad] 37.8704 2.547 14.871 0.000 32.879 42.862
education[T.5. Advanced Degree] 62.4355 2.764 22.591 0.000 57.019 67.852
year_s0 3.3874 4.257 0.796 0.426 -4.957 11.732
year_s1 1.8170 4.220 0.431 0.667 -6.454 10.088
year_s2 4.4943 1.754 2.563 0.010 1.057 7.931
age_s0 10.1360 5.932 1.709 0.087 -1.490 21.762
age_s1 47.6380 5.326 8.945 0.000 37.200 58.076
age_s2 6.7739 7.296 0.928 0.353 -7.526 21.074
age_s3 -10.0472 10.672 -0.941 0.346 -30.963 10.869
===================================================================================================
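A minimal sketch of getting predictions from the fitted GAM (an assumption: GLMGamResults.predict takes exog for the parametric part and exog_smooth for the spline variables; here we simply score the training data):

# Fitted values: parametric design matrix plus smoothed year/age columns
fitted = res.predict(exog=np.asarray(exog), exog_smooth=x_spline)
print(fitted[:5])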
One option is to use the pygam package.
from pygam import LinearGAM, s, f
from pygam.datasets import wage
X, y = wage(return_X_y=True)
## model
gam = LinearGAM(s(0) + s(1) + f(2)).fit(X, y)
# Plot partial dependence with 95% confidence bands for each non-intercept term
for i, term in enumerate(gam.terms):
    if term.isintercept:
        continue
    XX = gam.generate_X_grid(term=i)
    pdep, confi = gam.partial_dependence(term=i, X=XX, width=0.95)

    plt.figure()
    plt.plot(XX[:, term.feature], pdep)
    plt.plot(XX[:, term.feature], confi, c='r', ls='--')
    plt.title(repr(term))
    plt.show()
gam.summary()
LinearGAM
=============================================== ==========================================================
Distribution: NormalDist Effective DoF: 25.1911
Link Function: IdentityLink Log Likelihood: -24118.6847
Number of Samples: 3000 AIC: 48289.7516
AICc: 48290.2307
GCV: 1255.6902
Scale: 1236.7251
Pseudo R-Squared: 0.2955
==========================================================================================================
Feature Function Lambda Rank EDoF P > x Sig. Code
================================= ==================== ============ ============ ============ ============
s(0) [0.6] 20 7.1 5.95e-03 **
s(1) [0.6] 20 14.1 1.11e-16 ***
f(2) [0.6] 5 4.0 1.11e-16 ***
intercept 1 0.0 1.11e-16 ***
==========================================================================================================
Significance codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
WARNING: Fitting splines and a linear function to a feature introduces a model identifiability problem
which can cause p-values to appear significant when they are not.
WARNING: p-values calculated in this manner behave correctly for un-penalized models or models with
known smoothing parameters, but when smoothing parameters have been estimated, the p-values
are typically lower than they should be, meaning that the tests reject the null too readily.
<string>:3: UserWarning: KNOWN BUG: p-values computed in this summary are likely much smaller than they should be.
Please do not make inferences based on these values!
Collaborate on a solution, and stay up to date at:
github.com/dswah/pyGAM/issues/163
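The summary above shows the default penalty lam = 0.6 for every term. A minimal sketch of tuning it with pygam's gridsearch (the random log-scale grid is illustrative):

# Search a random grid of per-term lam values; gridsearch keeps the best GCV fit
lams = np.exp(np.random.rand(100, 3) * 6 - 3)
gam_tuned = LinearGAM(s(0) + s(1) + f(2)).gridsearch(X, y, lam=lams)
gam_tuned.summary()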