suppressPackageStartupMessages({
library(tidyverse)
library(tidymodels)
library(survival)
library(cowplot)
library(glmnet)
library(ggrepel)
library(geomtextpath)
})
theme_set(theme_cowplot())
options(repr.plot.width = 15, repr.plot.height = 9)
set.seed(42)
Regularized Cox Regression#
data <-
na.omit(pbc) |> # remove rows with missing data
mutate(surv=Surv(time, status==2)) # the surv variable will be our outcome
dim(data)
- 276
- 21
# separate 20% of the dataset for testing the models later
data_split <- initial_split(data, prop = 0.8, strata=time)
rec <- recipe(surv ~ ., data=data) |>
step_rm(time, status, id) |>
step_dummy(all_nominal_predictors()) |>
step_nzv(all_predictors())
prep_rec <- prep(rec)
train_data <-
prep_rec |>
bake(new_data=training(data_split))
train_data_x <- select(train_data, -surv)
cvfit <- cv.glmnet(as.matrix(train_data_x), train_data$surv, family = "cox", type.measure = "C", nfolds = 10, alpha=1)
cvfit
Call: cv.glmnet(x = as.matrix(train_data_x), y = train_data$surv, type.measure = "C", nfolds = 10, family = "cox", alpha = 1)
Measure: C-index
Lambda Index Measure SE Nonzero
min 0.0935748935632 13 0.805189039342 0.0420650566308 8
1se 0.2603776996468 2 0.785852648070 0.0290668508138 1
tidy(cvfit) |>
ggplot(aes(x=log(lambda), y=estimate)) +
geom_linerange(aes(ymin=conf.low, ymax=conf.high)) +
geom_point(size=3) +
geom_vline(xintercept=log(cvfit$lambda.min), linetype=2) +
labs(y='C-index')
tidy(cvfit$glmnet.fit) |>
ggplot(aes(x=log(lambda), y=estimate, color=term)) +
geom_hline(yintercept=0) +
geom_line() +
geom_labelpath(aes(label=term)) +
geom_vline(xintercept=log(cvfit$lambda.min), linetype=2) +
labs(y='parameter coefficients')
tidy(cvfit$glmnet.fit) |>
filter(lambda == cvfit$lambda.min) |>
mutate(exp.estimate=exp(estimate)) |>
ggplot(aes(x=exp.estimate, y=fct_reorder(term, estimate))) +
geom_vline(xintercept=1, color='darkgray') +
geom_point(size=5, shape=18) +
labs(x='hazard ratio at lambda.min', y='variable')
test_data <-
prep_rec |>
bake(new_data=testing(data_split))
test_data_x <- select(test_data, -surv) |> as.matrix()
z <- assess.glmnet(cvfit, newx=test_data_x, newy=test_data$surv, family='cox', s=c(cvfit$lambda.min))
z
- $deviance
- 115.865001937706
- $C
- 0.884467265725289