09- Logistic Regression Code Demo
1 R implementation
1.1 Binary (Binomial) Logistic Regression
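The folded chunk below prints the coefficient table of the fitted model. A minimal sketch of what it presumably runs, assuming the Default data from the ISLR2 package (loading the CSV used in the Python section would work equally well):

Code
library(ISLR2)  # assumption: Default data from ISLR2; a CSV would also work
logit_fit <- glm(default ~ balance, family = binomial, data = Default)
summary(logit_fit)$coefficients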
Code
Estimate Std. Error z value Pr(>|z|)
(Intercept) -10.651330614 0.3611573721 -29.49221 3.623124e-191
balance 0.005498917 0.0002203702 24.95309 1.976602e-137
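As a quick check of these estimates: at balance = 2000 the linear predictor is -10.6513 + 0.0055 * 2000 ≈ 0.3465, so the predicted probability of default is 1 / (1 + exp(-0.3465)) ≈ 0.586, which matches the predict() call below.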
Code
pi_hat <- predict(logit_fit, type = "response")
eta_hat <- predict(logit_fit, type = "link") ## "link" is the default: returns the linear predictor b0 + b1*x
predict(logit_fit, newdata = data.frame(balance = 2000), type = "response")
1
0.5857694
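The confusion matrix below compares the 0.5-threshold predictions with the observed classes. A sketch of the folded chunk that presumably produced it (pred_prob, reused by the ROC code further down, is assumed to be the same fitted-probability vector as pi_hat):

Code
pred_prob <- predict(logit_fit, type = "response")  # same as pi_hat above
table(pred_prob > 0.5, Default$default)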
No Yes
FALSE 9625 233
TRUE 42 100
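From the counts above we can read off the usual summary rates; a hedged check:

Code
conf_tab <- table(pred_prob > 0.5, Default$default)
sum(diag(conf_tab)) / sum(conf_tab)               # accuracy: (9625 + 100) / 10000 = 0.9725
conf_tab["TRUE", "Yes"] / sum(conf_tab[, "Yes"])  # sensitivity: 100 / 333 ≈ 0.300
conf_tab["FALSE", "No"] / sum(conf_tab[, "No"])   # specificity: 9625 / 9667 ≈ 0.996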
Code
library(ROCR)
# create an object of class prediction
pred <- ROCR::prediction(predictions = pred_prob, labels = Default$default)
# calculates the ROC curve
roc <- ROCR::performance(prediction.obj = pred, measure = "tpr", x.measure = "fpr")
plot(roc, colorize = TRUE, lwd = 3)
Code
auc <- ROCR::performance(prediction.obj = pred, measure = "auc")
auc@y.values
[[1]]
[1] 0.9479785
1.2 Multinomial Logistic Regression
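The folded chunk below presumably reads the hsbdemo data, makes "academic" the reference level of the outcome, and fits the model with nnet::multinom(). A sketch under those assumptions (the data path mirrors the Python section; the exact data-prep steps may differ):

Code
library(nnet)
library(haven)
multino_data <- read_dta("../data/hsbdemo.dta") |>
  dplyr::mutate(dplyr::across(c(prog, ses), as_factor))  # labelled -> factor
multino_data$prog2 <- relevel(multino_data$prog, ref = "academic")
multino_fit <- multinom(prog2 ~ ses + write, data = multino_data, trace = FALSE)
coef(multino_fit)          # log-odds relative to the "academic" baseline
head(fitted(multino_fit))  # fitted class probabilities, first six rows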
Code
(Intercept) sesmiddle seshigh write
general 2.852198 -0.5332810 -1.1628226 -0.0579287
vocation 5.218260 0.2913859 -0.9826649 -0.1136037
academic general vocation
1 0.1482764 0.3382454 0.5134781
2 0.1202017 0.1806283 0.6991700
3 0.4186747 0.2368082 0.3445171
4 0.1726885 0.3508384 0.4764731
5 0.1001231 0.1689374 0.7309395
6 0.3533566 0.2377976 0.4088458
Code
dses <- data.frame(ses = c("low", "middle", "high"),
                   write = mean(multino_data$write))
predict(multino_fit, newdata = dses, type = "probs")
academic general vocation
1 0.4396845 0.3581917 0.2021238
2 0.4777488 0.2283353 0.2939159
3 0.7009007 0.1784939 0.1206054
We can also use the glmnet package.
Code
library(fastDummies) # https://jacobkap.github.io/fastDummies/
library(glmnet)
Loading required package: Matrix
Loaded glmnet 4.1-8
Code
prog <- multino_data$prog2
dummy_dat <- dummy_cols(multino_data, select_columns = c("ses"))
x <- dummy_dat |> dplyr::select(ses_middle, ses_high, write) |> as.matrix()  # glmnet expects a numeric matrix
fit <- glmnet(x = x, y = prog, family = "multinomial", lambda = 0)
coef(fit)
$academic
4 x 1 sparse Matrix of class "dgCMatrix"
s0
-2.69052260
ses_middle .
ses_high 0.98278066
write 0.05793834
$general
4 x 1 sparse Matrix of class "dgCMatrix"
s0
0.1625919
ses_middle -0.5337675
ses_high -0.1805596
write .
$vocation
4 x 1 sparse Matrix of class "dgCMatrix"
s0
2.52793069
ses_middle 0.29119238
ses_high .
write -0.05566551
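The folded chunk below presumably evaluates the fitted class probabilities on the training matrix; a sketch (the third array index is the single lambda value, printed as s0):

Code
predict(fit, newx = x, type = "response")[1:3, , , drop = FALSE]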
Code
, , s0
academic general vocation
1 0.4396037 0.3582719 0.2021245
2 0.4777583 0.2283218 0.2939199
3 0.7009084 0.1784762 0.1206154
Code
# model_mat <- model.matrix(prog2~ses+write, data=multino_data)
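The commented-out line points to an alternative to fastDummies; a hedged sketch of that route (drop the intercept column so the design matrix matches the dummy selection above):

Code
model_mat <- model.matrix(prog2 ~ ses + write, data = multino_data)[, -1]
fit_mm <- glmnet(x = model_mat, y = prog, family = "multinomial", lambda = 0)
coef(fit_mm)  # should agree with the fastDummies-based fit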
2 Python implementation
2.1 Binary (Binomial) Logistic Regression
Code
import numpy as np
import pandas as pd
from sklearn.metrics import confusion_matrix
import matplotlib.pyplot as plt
Code
# Load your dataset
Default = pd.read_csv("../data/Default.csv")
Code
Default['default'] = Default['default'].map({'Yes': 1, 'No': 0})
from statsmodels.formula.api import logit
logit_fit = logit(formula='default ~ balance', data=Default).fit()
Optimization terminated successfully.
Current function value: 0.079823
Iterations 10
Code
logit_fit.summary2().tables[1]
Coef. Std.Err. z P>|z| [0.025 0.975]
Intercept -10.651331 0.361169 -29.491287 3.723665e-191 -11.359208 -9.943453
balance 0.005499 0.000220 24.952404 2.010855e-137 0.005067 0.005931
Code
pi_hat = logit_fit.predict(Default[['balance']])                   # type = "response" in R
eta_hat = logit_fit.predict(Default[['balance']], which="linear")  # type = "link" in R
new_data = pd.DataFrame({'balance': [2000]})
logit_fit.predict(new_data)
0 0.585769
dtype: float64
Code
from sklearn.metrics import confusion_matrix
pred_prob = logit_fit.predict(Default[['balance']])  # type = "response" in R
# Create predictions based on a 0.5 threshold
pred_class = (pred_prob > 0.5).astype(int)  # Convert to binary class (0 or 1)
## C00: true negatives; C01: false positives; C10: false negatives; C11: true positives
confusion_matrix(y_true=Default['default'], y_pred=pred_class)
array([[9625, 42],
[ 233, 100]])
Code
from sklearn.metrics import roc_curve, roc_auc_score
# Calculate the ROC curve
fpr, tpr, thresholds = roc_curve(Default['default'], pred_prob)

# Calculate the AUC (Area Under the Curve)
auc = roc_auc_score(Default['default'], pred_prob)
auc
0.9479784946837808
Code
plt.figure()
plt.plot(fpr, tpr, color='blue')
plt.plot([0, 1], [0, 1], color='gray', linestyle='--')  # Diagonal line
plt.xlabel('False Positive Rate (FPR)')
plt.ylabel('True Positive Rate (TPR)')
plt.show()
2.2 Multinomial Logistic Regression
Code
multino_data = pd.read_stata("../data/hsbdemo.dta")
Code
multino_data['prog2'] = multino_data['prog'].cat.reorder_categories(
    ['academic'] + [cat for cat in multino_data['prog'].cat.categories if cat != 'academic'],
    ordered=True
)
multino_data['prog2'].unique()
['vocation', 'general', 'academic']
Categories (3, object): ['academic' < 'general' < 'vocation']
Code
multino_data['ses'].unique()
['low', 'middle', 'high']
Categories (3, object): ['low' < 'middle' < 'high']
Code
multino_data['prog_int'] = multino_data['prog2'].map({
    'academic': 0,
    'general': 1,
    'vocation': 2
})
multino_data['prog_int'] = multino_data['prog_int'].cat.codes
Code
from statsmodels.formula.api import mnlogit
multino_fit = mnlogit("prog_int ~ ses + write", data=multino_data).fit()
Optimization terminated successfully.
Current function value: 0.899909
Iterations 6
Code
multino_fit.params
0 1
Intercept 2.852186 5.218200
ses[T.middle] -0.533291 0.291393
ses[T.high] -1.162832 -0.982670
write -0.057928 -0.113603
Code
fitted_df = pd.DataFrame(multino_fit.predict(),
                         columns=['academic', 'general', 'vocation'])
fitted_df.head()
academic general vocation
0 0.148278 0.338249 0.513473
1 0.120203 0.180629 0.699168
2 0.418679 0.236808 0.344513
3 0.172690 0.350841 0.476468
4 0.100125 0.168938 0.730937
Code
dses = pd.DataFrame({
    'ses': ['low', 'middle', 'high'],
    'write': [multino_data['write'].mean()] * 3
})
multino_fit.predict(dses)
0 1 2
0 0.439684 0.358193 0.202123
1 0.477749 0.228334 0.293917
2 0.700902 0.178493 0.120605
We can also use the sklearn package.
Code
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
# Separate features (X) and target (y)
X = multino_data[['ses', 'write']]   # Independent variables
y = multino_data['prog_int']         # Dependent variable (categorical target)

preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(drop='first'), ['ses']),  # One-hot encode 'ses'
        ('num', 'passthrough', ['write'])               # Leave 'write' as is
    ]
)

# Create a multinomial logistic regression model
model = Pipeline([
    ('preprocessor', preprocessor),  # Preprocessing step
    ('classifier', LogisticRegression(multi_class='multinomial', penalty=None,
                                      solver='lbfgs', max_iter=500))
])

model.fit(X, y)
Pipeline(steps=[('preprocessor', ColumnTransformer(transformers=[('cat', OneHotEncoder(drop='first'), ['ses']), ('num', 'passthrough', ['write'])])), ('classifier', LogisticRegression(max_iter=500, multi_class='multinomial', penalty=None))])
Code
model.named_steps.classifier.coef_
array([[-0.71516627, -0.63453974, 0.05717724],
[ 0.4476491 , -0.0049987 , -0.00075151],
[ 0.26751717, 0.63953844, -0.05642573]])
Code
model.named_steps.classifier.intercept_
array([-1.97496981, -0.28559419, 2.260564 ])
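These values look different from the statsmodels/multinom coefficients above for two reasons: scikit-learn's multinomial fit uses a symmetric softmax parameterization (no class is pinned to zero), and in this run OneHotEncoder(drop='first') appears to drop 'high', the alphabetically first category, rather than 'low'. A hedged check: differencing each row against the 'academic' row recovers baseline contrasts comparable to mnlogit once the different reference level of ses is taken into account.

Code
clf = model.named_steps.classifier
# Rows: 'general' and 'vocation' contrasted against 'academic' (class 0);
# columns appear to be [ses_low, ses_middle, write] relative to ses == 'high'
print(clf.coef_ - clf.coef_[0])
print(clf.intercept_ - clf.intercept_[0])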
Code
dses = pd.DataFrame({
    'ses': ['low', 'middle', 'high'],
    'write': [multino_data['write'].mean()] * 3
})
pd.DataFrame(model.predict_proba(dses),
             columns=model.named_steps.classifier.classes_)
0 1 2
0 0.439686 0.358190 0.202124
1 0.477748 0.228334 0.293917
2 0.700903 0.178494 0.120603