This package provides an implementation of the model introduced by Cederkvist et al. (2018) to model within-cluster dependence of both risk and timing in competing risk. For interested readers, a vignette on computational details can be found by calling vignette("mmcif-comp-details", "mmcif")
.
The package can be installed from Github by calling
library(remotes)
install_github("boennecd/mmcif", build_vignettes = TRUE)
The code benefits from being build with automatic vectorization so having e.g. -O3
in the CXX17FLAGS
flags in your Makevars file may be useful.
The conditional cumulative incidence functions for cause \(k\) of individual \(j\) in cluster \(i\) is
\[\begin{align*} F_{kij}(t\mid \vec u_i, \vec\eta_i) &= \pi_k(\vec z_{ij}, \vec u_i) \Phi(-\vec x_{ij}(t)^\top\vec\gamma_k - \eta_{ik}) \\ \pi_k(\vec z_{ij}, \vec u_i) &= \frac{\exp(\vec z_{ij}^\top\vec\beta_k + u_{ik})}{1 + \sum_{l = 1}^K\exp(\vec z_{ij}^\top\vec\beta_l + u_{il})} \\ \begin{pmatrix} \vec U_i \\ \vec\eta_i \end{pmatrix} &\sim N^{(2K)}(\vec 0;\Sigma).\end{align*}\]
where there are \(K\) competing risks. The \(\vec x_{ij}(t)^\top\vec\gamma_k\)’s for the trajectory must be constrained to be monotonically decreasing in \(t\). The covariates for the trajectory in this package are defined as
\[\vec x_{ij}(t) = (\vec h(t)^\top, \vec z_{ij}^\top)^\top\]
for a spline basis \(\vec h\) and known covariates \(\vec z_{ij}\) which are also used in the risk part of the model.
We start with a simple example where there are \(K = 2\) competing risks and
\[\begin{align*} \vec x_{ij}(t) &= \left(\text{arcthan}\left(\frac{t - \delta/2}{\delta/2}\right), 1, a_{ij}, b_{ij}\right) \\ a_{ij} &\sim N(0, 1) \\ b_{ij} &\sim \text{Unif}(-1, 1)\\ \vec z_{ij} &= (1, a_{ij}, b_{ij}) \end{align*}\]
We set the parameters below and plot the conditional cumulative incidence functions when the random effects are zero and the covariates are zero, \(a_{ij} = b_{ij} = 0\).
# assign model parameters
n_causes <- 2L
delta <- 2
# set the betas
coef_risk <- c(.67, 1, .1, -.4, .25, .3) |>
matrix(ncol = n_causes)
# set the gammas
coef_traject <- c(-.8, -.45, .8, .4, -1.2, .15, .25, -.2) |>
matrix(ncol = n_causes)
# plot the conditional cumulative incidence functions when random effects and
# covariates are all zero
local({
probs <- exp(coef_risk[1, ]) / (1 + sum(exp(coef_risk[1, ])))
par(mar = c(5, 5, 1, 1), mfcol = c(1, 2))
for(i in 1:2){
plot(\(x) probs[i] * pnorm(
-coef_traject[1, i] * atanh((x - delta / 2) / (delta / 2)) -
coef_traject[2, i]),
xlim = c(0, delta), ylim = c(0, 1), bty = "l", xlab = "Time",
ylab = sprintf("Cumulative incidence; cause %d", i),
yaxs = "i", xaxs = "i")
grid()
}
})
# set the covariance matrix
Sigma <- c(0.306, 0.008, -0.138, 0.197, 0.008, 0.759, 0.251,
-0.25, -0.138, 0.251, 0.756, -0.319, 0.197, -0.25, -0.319, 0.903) |>
matrix(2L * n_causes)
Next, we assign a function to simulate clusters. The cluster sizes are uniformly sampled from one to the maximum size. The censoring times are drawn from a uniform distribution from zero to \(3\delta\).
library(mvtnorm)
# simulates a data set with a given number of clusters and maximum number of
# observations per cluster
sim_dat <- \(n_clusters, max_cluster_size){
stopifnot(max_cluster_size > 0,
n_clusters > 0)
cluster_id <- 0L
apply(rmvnorm(n_clusters, sigma = Sigma), 1, \(rng_effects){
U <- head(rng_effects, n_causes)
eta <- tail(rng_effects, n_causes)
n_obs <- sample.int(max_cluster_size, 1L)
cluster_id <<- cluster_id + 1L
# draw the cause
covs <- cbind(a = rnorm(n_obs), b = runif(n_obs, -1))
Z <- cbind(1, covs)
cond_logits_exp <- exp(Z %*% coef_risk + rep(U, each = n_obs)) |>
cbind(1)
cond_probs <- cond_logits_exp / rowSums(cond_logits_exp)
cause <- apply(cond_probs, 1,
\(prob) sample.int(n_causes + 1L, 1L, prob = prob))
# compute the observed time if needed
obs_time <- mapply(\(cause, idx){
if(cause > n_causes)
return(delta)
# can likely be done smarter but this is more general
coefs <- coef_traject[, cause]
offset <- sum(Z[idx, ] * coefs[-1]) + eta[cause]
rng <- runif(1)
eps <- .Machine$double.eps
root <- uniroot(
\(x) rng - pnorm(
-coefs[1] * atanh((x - delta / 2) / (delta / 2)) - offset),
c(eps^2, delta * (1 - eps)), tol = 1e-12)$root
}, cause, 1:n_obs)
cens <- runif(n_obs, max = 3 * delta)
has_finite_trajectory_prob <- cause <= n_causes
is_censored <- which(!has_finite_trajectory_prob | cens < obs_time)
if(length(is_censored) > 0){
obs_time[is_censored] <- pmin(delta, cens[is_censored])
cause[is_censored] <- n_causes + 1L
}
data.frame(covs, cause, time = obs_time, cluster_id)
}, simplify = FALSE) |>
do.call(what = rbind)
}
We then sample a data set.
# sample a data set
set.seed(8401828)
n_clusters <- 1000L
max_cluster_size <- 5L
dat <- sim_dat(n_clusters, max_cluster_size = max_cluster_size)
# show some stats
NROW(dat) # number of individuals
#> [1] 2962
table(dat$cause) # distribution of causes (3 is censored)
#>
#> 1 2 3
#> 1249 542 1171
# distribution of observed times by cause
tapply(dat$time, dat$cause, quantile,
probs = seq(0, 1, length.out = 11), na.rm = TRUE)
#> $`1`
#> 0% 10% 20% 30% 40% 50% 60% 70%
#> 1.615e-05 4.918e-03 2.737e-02 9.050e-02 2.219e-01 4.791e-01 8.506e-01 1.358e+00
#> 80% 90% 100%
#> 1.744e+00 1.953e+00 2.000e+00
#>
#> $`2`
#> 0% 10% 20% 30% 40% 50% 60% 70%
#> 0.001448 0.050092 0.157119 0.276072 0.431094 0.669010 0.964643 1.336520
#> 80% 90% 100%
#> 1.607221 1.863063 1.993280
#>
#> $`3`
#> 0% 10% 20% 30% 40% 50% 60% 70%
#> 0.002123 0.246899 0.577581 1.007699 1.526068 2.000000 2.000000 2.000000
#> 80% 90% 100%
#> 2.000000 2.000000 2.000000
Then we setup the C++ object to do the computation.
library(mmcif)
comp_obj <- mmcif_data(
~ a + b, dat, cause = cause, time = time, cluster_id = cluster_id,
max_time = delta, spline_df = 4L)
The mmcif_data
function does not work with
\[h(t) = \text{arcthan}\left(\frac{t - \delta/2}{\delta/2}\right)\]
but instead with \(\vec g(h(t))\) where \(\vec g\) returns a natural cubic spline basis functions. The knots are based on quantiles of \(h(t)\) evaluated on the event times. The knots differ for each type of competing risk. The degrees of freedom of the splines is controlled with the spline_df
argument. There is a constraints
element on the object returned by the mmcif_data
function which contains matrices that ensures that the \(\vec x_{ij}(t)^\top\vec\gamma_k\)s are monotonically decreasing if \(C\vec\zeta > \vec 0\) where \(C\) is one of matrices and \(\vec\zeta\) is the concatenated vector of model parameters.
The time to compute the log composite likelihood is illustrated below.
NCOL(comp_obj$pair_indices) # the number of pairs in the composite likelihood
#> [1] 3911
length(comp_obj$singletons) # the number of clusters with one observation
#> [1] 202
# we need to find the combination of the spline bases that yield a straight
# line to construct the true values using the splines. You can skip this
comb_slope <- sapply(comp_obj$spline, \(spline){
boundary_knots <- spline$boundary_knots
pts <- seq(boundary_knots[1], boundary_knots[2], length.out = 1000)
lm.fit(cbind(1, spline$expansion(pts)), pts)$coef
})
# assign a function to compute the log composite likelihood
ll_func <- \(par, n_threads = 1L)
mmcif_logLik(
comp_obj, par = par, n_threads = n_threads, is_log_chol = FALSE)
# the log composite likelihood at the true parameters
coef_traject_spline <-
rbind(comb_slope[-1, ] * rep(coef_traject[1, ], each = NROW(comb_slope) - 1),
coef_traject[2, ] + comb_slope[1, ] * coef_traject[1, ],
coef_traject[-(1:2), ])
true_values <- c(coef_risk, coef_traject_spline, Sigma)
ll_func(true_values)
#> [1] -7087
# check the time to compute the log composite likelihood
bench::mark(
`one thread` = ll_func(n_threads = 1L, true_values),
`two threads` = ll_func(n_threads = 2L, true_values),
`three threads` = ll_func(n_threads = 3L, true_values),
`four threads` = ll_func(n_threads = 4L, true_values),
min_time = 4)
#> # A tibble: 4 × 6
#> expression min median `itr/sec` mem_alloc `gc/sec`
#> <bch:expr> <bch:tm> <bch:tm> <dbl> <bch:byt> <dbl>
#> 1 one thread 44ms 45.4ms 21.9 0B 0
#> 2 two threads 23ms 23.3ms 42.7 0B 0
#> 3 three threads 15.6ms 15.8ms 62.8 0B 0
#> 4 four threads 11.9ms 12.2ms 79.7 0B 0
# next, we compute the gradient of the log composite likelihood at the true
# parameters. First we assign a few functions to verify the result. You can
# skip these
upper_to_full <- \(x){
dim <- (sqrt(8 * length(x) + 1) - 1) / 2
out <- matrix(0, dim, dim)
out[upper.tri(out, TRUE)] <- x
out[lower.tri(out)] <- t(out)[lower.tri(out)]
out
}
d_upper_to_full <- \(x){
dim <- (sqrt(8 * length(x) + 1) - 1) / 2
out <- matrix(0, dim, dim)
out[upper.tri(out, TRUE)] <- x
out[upper.tri(out)] <- out[upper.tri(out)] / 2
out[lower.tri(out)] <- t(out)[lower.tri(out)]
out
}
# then we can compute the gradient with the function from the package and with
# numerical differentiation
gr_func <- function(par, n_threads = 1L)
mmcif_logLik_grad(comp_obj, par, n_threads = n_threads, is_log_chol = FALSE)
gr_package <- gr_func(true_values)
true_values_upper <-
c(coef_risk, coef_traject_spline, Sigma[upper.tri(Sigma, TRUE)])
gr_num <- numDeriv::grad(
\(x) ll_func(c(head(x, -10), upper_to_full(tail(x, 10)))),
true_values_upper, method = "simple")
# they are very close but not exactly equal as expected (this is due to the
# adaptive quadrature)
rbind(
`Numerical gradient` =
c(head(gr_num, -10), d_upper_to_full(tail(gr_num, 10))),
`Gradient package` = gr_package)
#> [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9]
#> Numerical gradient -98.12 -35.08 -34.63 48.15 -5.095 65.07 54.22 -43.89 -27.01
#> Gradient package -98.03 -35.02 -34.61 48.15 -5.050 65.09 54.26 -43.86 -26.99
#> [,10] [,11] [,12] [,13] [,14] [,15] [,16] [,17] [,18]
#> Numerical gradient -25.23 -36.50 -69.74 -60.26 -66.81 42.02 14.85 -7.650 -2.324
#> Gradient package -25.21 -36.43 -69.63 -60.22 -66.80 42.04 14.86 -7.641 -2.294
#> [,19] [,20] [,21] [,22] [,23] [,24] [,25] [,26] [,27]
#> Numerical gradient -5.068 -43.15 10.28 -1.492 4.406 0.2705 -1.492 10.41 -0.2874
#> Gradient package -5.024 -43.13 10.27 -1.464 4.420 0.2806 -1.464 10.86 -0.3570
#> [,28] [,29] [,30] [,31] [,32] [,33] [,34] [,35] [,36]
#> Numerical gradient -4.417 4.406 -0.2874 -36.46 6.307 0.2705 -4.417 6.307 8.955
#> Gradient package -4.402 4.420 -0.3570 -36.42 6.311 0.2806 -4.402 6.311 8.967
# check the time to compute the gradient of the log composite likelihood
bench::mark(
`one thread` = gr_func(n_threads = 1L, true_values),
`two threads` = gr_func(n_threads = 2L, true_values),
`three threads` = gr_func(n_threads = 3L, true_values),
`four threads` = gr_func(n_threads = 4L, true_values),
min_time = 4)
#> # A tibble: 4 × 6
#> expression min median `itr/sec` mem_alloc `gc/sec`
#> <bch:expr> <bch:tm> <bch:tm> <dbl> <bch:byt> <dbl>
#> 1 one thread 68.8ms 69.9ms 14.2 336B 0
#> 2 two threads 35.8ms 36.5ms 27.1 336B 0
#> 3 three threads 24.5ms 24.8ms 40.3 336B 0
#> 4 four threads 18.9ms 19.3ms 51.1 336B 0
Then we optimize the parameters.
# find the starting values
system.time(start <- mmcif_start_values(comp_obj, n_threads = 4L))
#> user system elapsed
#> 0.084 0.008 0.026
# the maximum likelihood without the random effects. Note that this is not
# comparable with the composite likelihood
attr(start, "logLik")
#> [1] -2650
# examples of using log_chol and log_chol_inv
log_chol(Sigma)
#> [1] -0.59209 0.01446 -0.13801 -0.24947 0.29229 -0.24852 0.35613 -0.29291
#> [9] -0.18532 -0.21077
stopifnot(all.equal(Sigma, log_chol(Sigma) |> log_chol_inv()))
# set true values
truth <- c(coef_risk, coef_traject_spline, log_chol(Sigma))
# we can verify that the gradient is correct
gr_package <- mmcif_logLik_grad(
comp_obj, truth, n_threads = 4L, is_log_chol = TRUE)
gr_num <- numDeriv::grad(
mmcif_logLik, truth, object = comp_obj, n_threads = 4L, is_log_chol = TRUE,
method = "simple")
rbind(`Numerical gradient` = gr_num, `Gradient package` = gr_package)
#> [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9]
#> Numerical gradient -98.12 -35.08 -34.63 48.15 -5.095 65.07 54.22 -43.89 -27.01
#> Gradient package -98.03 -35.02 -34.61 48.15 -5.050 65.09 54.26 -43.86 -26.99
#> [,10] [,11] [,12] [,13] [,14] [,15] [,16] [,17] [,18]
#> Numerical gradient -25.23 -36.50 -69.74 -60.26 -66.81 42.02 14.85 -7.650 -2.324
#> Gradient package -25.21 -36.43 -69.63 -60.22 -66.80 42.04 14.86 -7.641 -2.294
#> [,19] [,20] [,21] [,22] [,23] [,24] [,25] [,26] [,27]
#> Numerical gradient -5.068 -43.15 5.159 -4.346 17.90 27.54 -25.51 -46.20 3.408
#> Gradient package -5.024 -43.13 5.150 -4.263 18.55 27.55 -25.61 -46.14 3.422
#> [,28] [,29] [,30]
#> Numerical gradient -9.259 6.518 11.75
#> Gradient package -9.233 6.520 11.77
# optimize the log composite likelihood
system.time(fit <- mmcif_fit(start$upper, comp_obj, n_threads = 4L))
#> user system elapsed
#> 46.676 0.019 11.704
# the log composite likelihood at different points
mmcif_logLik(comp_obj, truth, n_threads = 4L, is_log_chol = TRUE)
#> [1] -7087
mmcif_logLik(comp_obj, start$upper, n_threads = 4L, is_log_chol = TRUE)
#> [1] -7572
-fit$value
#> [1] -7050
We may reduce the estimation time by using a different number of quadrature nodes starting with fewer nodes successively updating the fits as shown below.
# the number of nodes we used
length(comp_obj$ghq_data[[1]])
#> [1] 5
# with successive updates
ghq_lists <- lapply(
setNames(c(2L, 6L), c(2L, 6L)),
\(n_nodes)
fastGHQuad::gaussHermiteData(n_nodes) |>
with(list(node = x, weight = w)))
system.time(
fits <- mmcif_fit(
start$upper, comp_obj, n_threads = 4L, ghq_data = ghq_lists))
#> user system elapsed
#> 39.285 0.012 9.831
# compare the estimates
rbind(sapply(fits, `[[`, "par") |> t(),
`Previous` = fit$par)
#> cause1:risk:(Intercept) cause1:risk:a cause1:risk:b
#> 0.5584 0.9124 0.08503
#> 0.5854 0.9494 0.08946
#> Previous 0.5868 0.9495 0.08984
#> cause2:risk:(Intercept) cause2:risk:a cause2:risk:b cause1:spline1
#> -0.4267 0.1942 0.4920 -2.751
#> -0.4068 0.2085 0.5040 -2.761
#> Previous -0.4088 0.2086 0.5051 -2.761
#> cause1:spline2 cause1:spline3 cause1:spline4
#> -3.624 -6.512 -4.968
#> -3.636 -6.536 -4.983
#> Previous -3.636 -6.536 -4.983
#> cause1:traject:(Intercept) cause1:traject:a cause1:traject:b
#> 2.782 0.7863 0.3197
#> 2.789 0.7898 0.3207
#> Previous 2.789 0.7898 0.3207
#> cause2:spline1 cause2:spline2 cause2:spline3 cause2:spline4
#> -2.819 -3.233 -6.063 -4.771
#> -2.890 -3.310 -6.214 -4.881
#> Previous -2.890 -3.309 -6.213 -4.880
#> cause2:traject:(Intercept) cause2:traject:a cause2:traject:b
#> 3.326 0.2420 -0.3430
#> 3.365 0.2468 -0.3479
#> Previous 3.363 0.2468 -0.3477
#> vcov:risk1:risk1 vcov:risk1:risk2 vcov:risk2:risk2 vcov:risk1:traject1
#> -1.0779 -0.26819 -0.3546 -0.18162
#> -0.4792 0.07051 -0.1160 -0.09138
#> Previous -0.4770 0.08131 -0.1043 -0.09116
#> vcov:risk2:traject1 vcov:traject1:traject1 vcov:risk1:traject2
#> 0.2717 -0.2781 0.4359
#> 0.2791 -0.2546 0.2575
#> Previous 0.2768 -0.2536 0.2575
#> vcov:risk2:traject2 vcov:traject1:traject2 vcov:traject2:traject2
#> -0.4860 -0.03015 -0.2906
#> -0.4972 -0.10351 -0.1518
#> Previous -0.4939 -0.10619 -0.1505
print(fits[[length(fits)]]$value, digits = 10)
#> [1] 7050.314423
print(fit $value, digits = 10)
#> [1] 7050.351508
Then we compute the sandwich estimator. The Hessian is currently computed with numerical differentiation which is why it takes a while.
system.time(sandwich_est <- mmcif_sandwich(comp_obj, fit$par, n_threads = 4L))
#> user system elapsed
#> 23.754 0.009 5.953
# setting order equal to zero yield no Richardson extrapolation and just
# standard symmetric difference quotient. This is less precise but faster
system.time(sandwich_est_simple <-
mmcif_sandwich(comp_obj, fit$par, n_threads = 4L, order = 0L))
#> user system elapsed
#> 4.935 0.000 1.236
We show the estimated and true the conditional cumulative incidence functions (the dashed curves are the estimates) when the random effects are zero and the covariates are zero, \(a_{ij} = b_{ij} = 0\).
local({
# get the estimates
coef_risk_est <- fit$par[comp_obj$indices$coef_risk] |>
matrix(ncol = n_causes)
coef_traject_time_est <- fit$par[comp_obj$indices$coef_trajectory_time] |>
matrix(ncol = n_causes)
coef_traject_est <- fit$par[comp_obj$indices$coef_trajectory] |>
matrix(ncol = n_causes)
coef_traject_intercept_est <- coef_traject_est[5, ]
# compute the risk probabilities
probs <- exp(coef_risk[1, ]) / (1 + sum(exp(coef_risk[1, ])))
probs_est <- exp(coef_risk_est[1, ]) / (1 + sum(exp(coef_risk_est[1, ])))
# plot the estimated and true conditional cumulative incidence functions. The
# estimates are the dashed lines
par(mar = c(5, 5, 1, 1), mfcol = c(1, 2))
pts <- seq(1e-8, delta * (1 - 1e-8), length.out = 1000)
for(i in 1:2){
true_vals <- probs[i] * pnorm(
-coef_traject[1, i] * atanh((pts - delta / 2) / (delta / 2)) -
coef_traject[2, i])
estimates <- probs_est[i] * pnorm(
-comp_obj$time_expansion(pts, cause = i) %*% coef_traject_time_est[, i] -
coef_traject_intercept_est[i]) |> drop()
matplot(pts, cbind(true_vals, estimates), xlim = c(0, delta),
ylim = c(0, 1), bty = "l", xlab = "Time", lty = c(1, 2),
ylab = sprintf("Cumulative incidence; cause %d", i),
yaxs = "i", xaxs = "i", type = "l", col = "black")
grid()
}
})
Further illustrations of the estimated model are given below.
# the number of call we made
fit$counts
#> function gradient
#> 438 269
fit$outer.iterations
#> [1] 3
# compute the standard errors from the sandwich estimator
SEs <- diag(sandwich_est) |> sqrt()
SEs_simple <- diag(sandwich_est_simple) |> sqrt()
# compare the estimates with the true values
rbind(`Estimate AGHQ` = fit$par[comp_obj$indices$coef_risk],
`Standard errors` = SEs[comp_obj$indices$coef_risk],
`Standard errors simple` = SEs_simple[comp_obj$indices$coef_risk],
Truth = truth[comp_obj$indices$coef_risk])
#> cause1:risk:(Intercept) cause1:risk:a cause1:risk:b
#> Estimate AGHQ 0.58676 0.94946 0.08984
#> Standard errors 0.07241 0.06901 0.10193
#> Standard errors simple 0.07241 0.06901 0.10193
#> Truth 0.67000 1.00000 0.10000
#> cause2:risk:(Intercept) cause2:risk:a cause2:risk:b
#> Estimate AGHQ -0.40878 0.20863 0.5051
#> Standard errors 0.09896 0.07073 0.1233
#> Standard errors simple 0.09896 0.07073 0.1233
#> Truth -0.40000 0.25000 0.3000
rbind(`Estimate AGHQ` = fit$par[comp_obj$indices$coef_trajectory],
`Standard errors` = SEs[comp_obj$indices$coef_trajectory],
`Standard errors simple` = SEs_simple[comp_obj$indices$coef_trajectory],
Truth = truth[comp_obj$indices$coef_trajectory])
#> cause1:spline1 cause1:spline2 cause1:spline3
#> Estimate AGHQ -2.7613 -3.6362 -6.5362
#> Standard errors 0.1115 0.1320 0.2122
#> Standard errors simple 0.1116 0.1321 0.2122
#> Truth -2.8546 -3.5848 -6.5119
#> cause1:spline4 cause1:traject:(Intercept)
#> Estimate AGHQ -4.9826 2.7890
#> Standard errors 0.1573 0.1048
#> Standard errors simple 0.1573 0.1048
#> Truth -4.9574 2.8655
#> cause1:traject:a cause1:traject:b cause2:spline1
#> Estimate AGHQ 0.78980 0.32069 -2.8896
#> Standard errors 0.05136 0.06089 0.2251
#> Standard errors simple 0.05136 0.06089 0.2250
#> Truth 0.80000 0.40000 -2.5969
#> cause2:spline2 cause2:spline3 cause2:spline4
#> Estimate AGHQ -3.3089 -6.2127 -4.8797
#> Standard errors 0.2291 0.4477 0.3179
#> Standard errors simple 0.2290 0.4473 0.3176
#> Truth -3.3416 -6.0232 -4.6611
#> cause2:traject:(Intercept) cause2:traject:a
#> Estimate AGHQ 3.3634 0.24684
#> Standard errors 0.2574 0.06955
#> Standard errors simple 0.2572 0.06954
#> Truth 3.1145 0.25000
#> cause2:traject:b
#> Estimate AGHQ -0.3477
#> Standard errors 0.1059
#> Standard errors simple 0.1059
#> Truth -0.2000
n_vcov <- (2L * n_causes * (2L * n_causes + 1L)) %/% 2L
Sigma
#> [,1] [,2] [,3] [,4]
#> [1,] 0.306 0.008 -0.138 0.197
#> [2,] 0.008 0.759 0.251 -0.250
#> [3,] -0.138 0.251 0.756 -0.319
#> [4,] 0.197 -0.250 -0.319 0.903
log_chol_inv(tail(fit$par, n_vcov))
#> [,1] [,2] [,3] [,4]
#> [1,] 0.38517 0.05046 -0.05658 0.1598
#> [2,] 0.05046 0.81841 0.24202 -0.4241
#> [3,] -0.05658 0.24202 0.68717 -0.2426
#> [4,] 0.15978 -0.42407 -0.24261 1.0616
# on the log Cholesky scale
rbind(`Estimate AGHQ` = fit$par[comp_obj$indices$vcov_upper],
`Standard errors` = SEs[comp_obj$indices$vcov_upper],
`Standard errors simple` = SEs_simple[comp_obj$indices$vcov_upper],
Truth = truth[comp_obj$indices$vcov_upper])
#> vcov:risk1:risk1 vcov:risk1:risk2 vcov:risk2:risk2
#> Estimate AGHQ -0.4770 0.08131 -0.1043
#> Standard errors 0.2079 0.23574 0.1577
#> Standard errors simple 0.2079 0.23573 0.1577
#> Truth -0.5921 0.01446 -0.1380
#> vcov:risk1:traject1 vcov:risk2:traject1
#> Estimate AGHQ -0.09116 0.2768
#> Standard errors 0.14510 0.1155
#> Standard errors simple 0.14510 0.1155
#> Truth -0.24947 0.2923
#> vcov:traject1:traject1 vcov:risk1:traject2
#> Estimate AGHQ -0.2536 0.2575
#> Standard errors 0.1064 0.2402
#> Standard errors simple 0.1064 0.2402
#> Truth -0.2485 0.3561
#> vcov:risk2:traject2 vcov:traject1:traject2
#> Estimate AGHQ -0.4939 -0.1062
#> Standard errors 0.2011 0.1501
#> Standard errors simple 0.2011 0.1501
#> Truth -0.2929 -0.1853
#> vcov:traject2:traject2
#> Estimate AGHQ -0.1505
#> Standard errors 0.1871
#> Standard errors simple 0.1870
#> Truth -0.2108
# on the original covariance matrix scale
vcov_est <- log_chol_inv(tail(fit$par, n_vcov))
vcov_est[lower.tri(vcov_est)] <- NA_real_
vcov_SE <- matrix(NA_real_, NROW(vcov_est), NCOL(vcov_est))
vcov_SE[upper.tri(vcov_SE, TRUE)] <-
attr(sandwich_est, "res vcov") |> diag() |> sqrt() |>
tail(n_vcov)
vcov_show <- cbind(Estimates = vcov_est, NA, SEs = vcov_SE)
colnames(vcov_show) <-
c(rep("Est.", NCOL(vcov_est)), "", rep("SE", NCOL(vcov_est)))
print(vcov_show, na.print = "")
#> Est. Est. Est. Est. SE SE SE SE
#> [1,] 0.3852 0.05046 -0.05658 0.1598 0.1602 0.1509 0.08933 0.1506
#> [2,] 0.81841 0.24202 -0.4241 0.2723 0.11034 0.1815
#> [3,] 0.68717 -0.2426 0.11579 0.1033
#> [4,] 1.0616 0.2819
Sigma # the true values
#> [,1] [,2] [,3] [,4]
#> [1,] 0.306 0.008 -0.138 0.197
#> [2,] 0.008 0.759 0.251 -0.250
#> [3,] -0.138 0.251 0.756 -0.319
#> [4,] 0.197 -0.250 -0.319 0.903
The mmcif_pd_univariate
function is provided to compute the marginal CIFs, the derivative of the marginal CIFs, and the marginal survival probability.
# compute the univariate marginal CIFs
ex_dat <- data.frame(a = c(-.5, .25), b = c(.1, .8))
compute_cif <- \(cause, time = .25){
mmcif_pd_univariate(
fit$par, comp_obj, newdata = ex_dat[1, ], cause = cause,
time = time, type = "cumulative")
}
m_cifs <- sapply(1:2, compute_cif)
m_cifs
#> [1] 0.21531 0.06461
# these match with the survival probability
m_surv <- compute_cif(3L)
all.equal(m_surv, 1 - sum(m_cifs))
#> [1] TRUE
# we can also get the derivative of the marginal CIFs
compute_dens <- \(cause, time = .25){
mmcif_pd_univariate(
fit$par, comp_obj, newdata = ex_dat[1, ], cause = cause,
time = time, type = "derivative")
}
compute_dens(2L)
#> [1] 0.1823
# they integrate to the CIFS
int_m_cifs <- sapply(
1:2, \(cause)
integrate(Vectorize(compute_dens), 1e-12, .25, rel.tol = 1e-10,
cause = cause)$value)
all.equal(int_m_cifs, m_cifs) # ~ the same
#> [1] "Mean relative difference: 1.106e-06"
# we can compute a marginal cause-specific hazard using these
local({
tis <- seq(1e-2, 2 - 1e-2, length.out = 200)
pd_wrap <- \(ti, type, cause)
mmcif_pd_univariate(
fit$par, comp_obj, newdata = ex_dat[1, ], cause = cause,
time = ti, type = type, use_log = TRUE)
log_m_survs <- sapply(tis, pd_wrap, type = "cumulative", cause = 3L)
log_dens <- sapply(tis, pd_wrap, type = "derivative", cause = 2L)
par(mar = c(5, 5, 1, 1))
hazs <- exp(log_dens - log_m_survs)
plot(tis, hazs, xlab = "Time", type = "l", bty = "l",
ylab = "Marginal hazard", ylim = range(0, hazs))
grid()
})
There is a bivariate version as well. This gives the marginal bivaraite CIFs, marginal survival probabilities, the marginal density, and mixtures thereof as illustrated below.
# wrapper to simplify calls to mmcif_pd_bivariate
mmcif_pd_bivariate_wrap <- \(time = c(.25, 1.33), cause, type){
mmcif_pd_bivariate(
fit$par, comp_obj, newdata = ex_dat, cause = cause,
time = time, ghq_data = ghq_lists[[2]], type = type)
}
# compute all configurations except for the double censored one
(cause_combs <- cbind(rep(1:3, each = 3), rep(1:3, 3)))
#> [,1] [,2]
#> [1,] 1 1
#> [2,] 1 2
#> [3,] 1 3
#> [4,] 2 1
#> [5,] 2 2
#> [6,] 2 3
#> [7,] 3 1
#> [8,] 3 2
#> [9,] 3 3
cause_combs <- cause_combs[rowSums(cause_combs) < 6L, ]
m_cifs <- apply(cause_combs, 1, \(cause){
mmcif_pd_bivariate_wrap(cause = cause, type = c("cumulative", "cumulative"))
})
m_cifs
#> [1] 0.09724 0.02519 0.09289 0.01221 0.02629 0.02612 0.21444 0.12867
# we can recover the probability of both being censored
m_surv <- mmcif_pd_bivariate_wrap(
cause = c(3L, 3L), type = c("cumulative", "cumulative"))
all.equal(m_surv, 1 - sum(m_cifs))
#> [1] TRUE
# we can also compute the derivative of a CIF in one argument and the CIF in the
# other
mmcif_pd_bivariate_wrap(cause = c(1L, 1L), type = c("derivative", "cumulative"))
#> [1] 0.07968
# we can also compute the derivative in both
mmcif_pd_bivariate_wrap(cause = c(1L, 1L), type = c("derivative", "derivative"))
#> [1] 0.03915
# they integrate numerically to roughly the right value
(combs_to_test <- cause_combs[!apply(cause_combs == 3, 1L, any), ])
#> [,1] [,2]
#> [1,] 1 1
#> [2,] 1 2
#> [3,] 2 1
#> [4,] 2 2
apply(combs_to_test, 1, \(cause){
# compute the CIF numerically and compute the relative error
f1 <- Vectorize(
\(x, cause1, type1){
mmcif_pd_bivariate(
fit$par, comp_obj, newdata = ex_dat, cause = c(cause1, cause[2]),
time = c(x, 1.33), ghq_data = ghq_lists[[2]],
type = c(type1, "cumulative"))
}, vectorize.args = "x")
int_val <- integrate(
f1, 1e-8, .25, cause1 = cause[1], type1 = "derivative",
rel.tol = 1e-10)$value
func_out <- f1(.25, cause[1], "cumulative")
err1 <- (func_out - int_val) / int_val
# do the same for the other argument
f2 <- Vectorize(
\(x, cause2, type2){
mmcif_pd_bivariate(
fit$par, comp_obj, newdata = ex_dat, cause = c(cause[1], cause2),
time = c(.25, x), ghq_data = ghq_lists[[2]],
type = c("cumulative", type2))
}, vectorize.args = "x")
int_val <- integrate(
f2, 1e-8, 1.33, cause2 = cause[2], type2 = "derivative",
rel.tol = 1e-10)$value
func_out <- f2(1.33, cause[2], "cumulative")
err2 <- (func_out - int_val) / int_val
# return the relative errors
c(err1, err2)
}) # ~ tiny relative errors
#> [,1] [,2] [,3] [,4]
#> [1,] 4.448e-07 2.96e-07 3.800e-08 -5.398e-07
#> [2,] 5.409e-08 4.85e-07 9.375e-08 -7.684e-07
# we can also do the check with one CIF and one derivative
apply(combs_to_test, 1, \(cause){
# compute the cif numerically
f1 <- Vectorize(
\(x, cause1, type1){
mmcif_pd_bivariate(
fit$par, comp_obj, newdata = ex_dat, cause = c(cause1, cause[2]),
time = c(x, 1.33), ghq_data = ghq_lists[[2]],
type = c(type1, "derivative"))
}, vectorize.args = "x")
int_val <- integrate(
f1, 1e-8, .25, cause1 = cause[1], type1 = "derivative", rel.tol = 1e-10)$value
func_out <- f1(.25, cause[1], "cumulative")
err1 <- (func_out - int_val) / int_val
# do the same for the other argument
f2 <- Vectorize(
\(x, cause2, type2){
mmcif_pd_bivariate(
fit$par, comp_obj, newdata = ex_dat, cause = c(cause[1], cause2),
time = c(.25, x), ghq_data = ghq_lists[[2]],
type = c("derivative", type2))
}, vectorize.args = "x")
int_val <- integrate(
f2, 1e-8, 1.33, cause2 = cause[2], type2 = "derivative",
rel.tol = 1e-10)$value
func_out <- f2(1.33, cause[2], "cumulative")
err2 <- (func_out - int_val) / int_val
# return the relative errors
c(err1, err2)
})
#> [,1] [,2] [,3] [,4]
#> [1,] 8.270e-08 3.573e-07 -7.029e-08 -1.120e-07
#> [2,] 8.151e-08 2.964e-07 1.580e-07 -3.787e-07
Finally, there is also function to compute the marginal density, CIF, survival probability or hazard conditional on the outcome of another individual.
# define two wrapper to simplify the calls to mmcif_pd_univariate and
# mmcif_pd_cond
compute_uncond <- Vectorize(
\(time, cause, type){
mmcif_pd_univariate(
fit$par, comp_obj, newdata = ex_dat[1, ], cause = cause,
time = time, type = type)
}, "time")
compute_cond <- Vectorize(
\(time, cause, type){
mmcif_pd_cond(
fit$par, comp_obj, newdata = ex_dat, cause = c(2L, cause),
time = c(.25, time), type_obs = type, type_cond = "cumulative",
which_cond = 1L)
}, "time")
# the conditional figures are the dashed curves
local({
tis <- seq(1e-2, 2 - 1e-2, length.out = 200)
cifs <- cbind(compute_uncond(tis, 2L, "cumulative"),
compute_cond(tis, 2L, "cumulative"))
derivs <- cbind(compute_uncond(tis, 2L, "derivative"),
compute_cond(tis, 2L, "derivative"))
par(mar = c(5, 5, 1, 1))
matplot(tis, cifs, type = "l", lty = 1:2, col = "black", bty = "l",
xlab = "Time", ylab = "Marginal CIF")
grid()
matplot(tis, derivs, type = "l", lty = 1:2, col = "black", bty = "l",
xlab = "Time", ylab = "Marginal derivatives")
grid()
# do the same for the hazard
hazs_cond <- compute_cond(tis, 2L, "hazard")
hazs_uncond <-
compute_uncond(tis, "derivative", cause = 2L) /
compute_uncond(tis, "cumulative", cause = 3L)
matplot(tis, cbind(hazs_uncond, hazs_cond), type = "l", lty = 1:2,
col = "black", bty = "l", xlab = "Time", ylab = "Marginal hazards")
grid()
})
# we can check a standard equality
cond_surv <- compute_cond(.25, cause = 3L, type = "cumulative")
cond_deriv <- compute_cond(.25, cause = 1L, type = "derivative")
cond_haz <- compute_cond(.25, cause = 1L, type = "hazard")
all.equal(cond_surv * cond_haz, cond_deriv)
#> [1] TRUE
# the conditional derivative integrates to the conditional CIF
num_cumulative <- integrate(
compute_cond, 1e-8, .25, cause = 1L, type = "derivative",
rel.tol = 1e-10)$value
all.equal(num_cumulative, compute_cond(.25, cause = 1L, type = "cumulative"))
#> [1] "Mean relative difference: 3.475e-07"
We extend the previous example to the setting where there may be delayed entry (left truncation). Thus, we assign a new simulation function. The delayed entry is sampled by sampling a random variable from the uniform distribution on -1 to 1 and taking the entry time as being the maximum of this variable and zero.
library(mvtnorm)
# simulates a data set with a given number of clusters and maximum number of
# observations per cluster
sim_dat <- \(n_clusters, max_cluster_size){
stopifnot(max_cluster_size > 0,
n_clusters > 0)
cluster_id <- 0L
replicate(n_clusters, simplify = FALSE, {
n_obs <- sample.int(max_cluster_size, 1L)
cluster_id <<- cluster_id + 1L
# draw the covariates and the left truncation time
covs <- cbind(a = rnorm(n_obs), b = runif(n_obs, -1))
Z <- cbind(1, covs)
delayed_entry <- pmax(runif(n_obs, -1), 0)
cens <- rep(-Inf, n_obs)
while(all(cens <= delayed_entry))
cens <- runif(n_obs, max = 3 * delta)
successful_sample <- FALSE
while(!successful_sample){
rng_effects <- rmvnorm(1, sigma = Sigma) |> drop()
U <- head(rng_effects, n_causes)
eta <- tail(rng_effects, n_causes)
# draw the cause
cond_logits_exp <- exp(Z %*% coef_risk + rep(U, each = n_obs)) |>
cbind(1)
cond_probs <- cond_logits_exp / rowSums(cond_logits_exp)
cause <- apply(cond_probs, 1,
\(prob) sample.int(n_causes + 1L, 1L, prob = prob))
# compute the observed time if needed
obs_time <- mapply(\(cause, idx){
if(cause > n_causes)
return(delta)
# can likely be done smarter but this is more general
coefs <- coef_traject[, cause]
offset <- sum(Z[idx, ] * coefs[-1]) + eta[cause]
rng <- runif(1)
eps <- .Machine$double.eps
root <- uniroot(
\(x) rng - pnorm(
-coefs[1] * atanh((x - delta / 2) / (delta / 2)) - offset),
c(eps^2, delta * (1 - eps)), tol = 1e-12)$root
}, cause, 1:n_obs)
keep <- which(pmin(obs_time, cens) > delayed_entry)
successful_sample <- length(keep) > 0
if(!successful_sample)
next
has_finite_trajectory_prob <- cause <= n_causes
is_censored <- which(!has_finite_trajectory_prob | cens < obs_time)
if(length(is_censored) > 0){
obs_time[is_censored] <- pmin(delta, cens[is_censored])
cause[is_censored] <- n_causes + 1L
}
}
data.frame(covs, cause, time = obs_time, cluster_id, delayed_entry)[keep, ]
}) |>
do.call(what = rbind)
}
We sample a data set using the new simulation function.
# sample a data set
set.seed(51312406)
n_clusters <- 1000L
max_cluster_size <- 5L
dat <- sim_dat(n_clusters, max_cluster_size = max_cluster_size)
# show some stats
NROW(dat) # number of individuals
#> [1] 2524
table(dat$cause) # distribution of causes (3 is censored)
#>
#> 1 2 3
#> 976 435 1113
# distribution of observed times by cause
tapply(dat$time, dat$cause, quantile,
probs = seq(0, 1, length.out = 11), na.rm = TRUE)
#> $`1`
#> 0% 10% 20% 30% 40% 50% 60% 70%
#> 1.389e-06 1.155e-02 6.302e-02 2.279e-01 5.696e-01 9.796e-01 1.312e+00 1.650e+00
#> 80% 90% 100%
#> 1.887e+00 1.981e+00 2.000e+00
#>
#> $`2`
#> 0% 10% 20% 30% 40% 50% 60% 70%
#> 0.002019 0.090197 0.280351 0.429229 0.658360 0.906824 1.180597 1.409366
#> 80% 90% 100%
#> 1.674830 1.877513 1.996200
#>
#> $`3`
#> 0% 10% 20% 30% 40% 50% 60% 70%
#> 0.005216 0.462201 0.836501 1.188546 1.599797 2.000000 2.000000 2.000000
#> 80% 90% 100%
#> 2.000000 2.000000 2.000000
# distribution of the left truncation time
quantile(dat$delayed_entry, probs = seq(0, 1, length.out = 11))
#> 0% 10% 20% 30% 40% 50% 60% 70%
#> 0.000e+00 0.000e+00 0.000e+00 0.000e+00 0.000e+00 0.000e+00 7.271e-05 2.151e-01
#> 80% 90% 100%
#> 4.463e-01 7.110e-01 9.990e-01
Next, we fit the model as before but this time we pass the delayed entry time.
library(mmcif)
comp_obj <- mmcif_data(
~ a + b, dat, cause = cause, time = time, cluster_id = cluster_id,
max_time = delta, spline_df = 4L, left_trunc = delayed_entry)
# we need to find the combination of the spline bases that yield a straight
# line to construct the true values using the splines. You can skip this
comb_slope <- sapply(comp_obj$spline, \(spline){
boundary_knots <- spline$boundary_knots
pts <- seq(boundary_knots[1], boundary_knots[2], length.out = 1000)
lm.fit(cbind(1, spline$expansion(pts)), pts)$coef
})
coef_traject_spline <-
rbind(comb_slope[-1, ] * rep(coef_traject[1, ], each = NROW(comb_slope) - 1),
coef_traject[2, ] + comb_slope[1, ] * coef_traject[1, ],
coef_traject[-(1:2), ])
# set true values
truth <- c(coef_risk, coef_traject_spline, log_chol(Sigma))
# find the starting values
system.time(start <- mmcif_start_values(comp_obj, n_threads = 4L))
#> user system elapsed
#> 0.056 0.000 0.017
# we can verify that the gradient is correct again
gr_package <- mmcif_logLik_grad(
comp_obj, truth, n_threads = 4L, is_log_chol = TRUE)
gr_num <- numDeriv::grad(
mmcif_logLik, truth, object = comp_obj, n_threads = 4L, is_log_chol = TRUE,
method = "simple")
rbind(`Numerical gradient` = gr_num, `Gradient package` = gr_package)
#> [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9]
#> Numerical gradient -47.71 -8.791 6.978 7.570 7.152 6.220 5.934 8.550 -28.05
#> Gradient package -47.65 -8.753 6.991 7.571 7.179 6.233 5.957 8.573 -28.04
#> [,10] [,11] [,12] [,13] [,14] [,15] [,16] [,17] [,18]
#> Numerical gradient 18.37 -47.03 86.44 2.075 -45.32 -17.03 13.93 17.29 -20.57
#> Gradient package 18.38 -46.99 86.50 2.098 -45.31 -17.02 13.93 17.30 -20.55
#> [,19] [,20] [,21] [,22] [,23] [,24] [,25] [,26] [,27]
#> Numerical gradient 20.15 -1.487 6.760 -5.759 -2.593 -14.53 20.44 -9.739 5.922
#> Gradient package 20.18 -1.479 6.753 -5.687 -2.036 -14.53 20.37 -9.701 5.931
#> [,28] [,29] [,30]
#> Numerical gradient -10.99 -14.59 4.312
#> Gradient package -10.97 -14.59 4.324
# optimize the log composite likelihood
system.time(fit <- mmcif_fit(start$upper, comp_obj, n_threads = 4L))
#> user system elapsed
#> 49.872 0.012 12.473
# the log composite likelihood at different points
mmcif_logLik(comp_obj, truth, n_threads = 4L, is_log_chol = TRUE)
#> [1] -4745
mmcif_logLik(comp_obj, start$upper, n_threads = 4L, is_log_chol = TRUE)
#> [1] -5077
-fit$value
#> [1] -4724
Then we compute the sandwich estimator. The Hessian is currently computed with numerical differentiation which is why it takes a while.
system.time(sandwich_est <- mmcif_sandwich(comp_obj, fit$par, n_threads = 4L))
#> user system elapsed
#> 41.487 0.004 10.394
# setting order equal to zero yield no Richardson extrapolation and just
# standard symmetric difference quotient. This is less precise but faster
system.time(sandwich_est_simple <-
mmcif_sandwich(comp_obj, fit$par, n_threads = 4L, order = 0L))
#> user system elapsed
#> 9.521 0.000 2.386
We show the estimated and true the conditional cumulative incidence functions (the dashed curves are the estimates) when the random effects are zero and the covariates are zero, \(a_{ij} = b_{ij} = 0\).
local({
# get the estimates
coef_risk_est <- fit$par[comp_obj$indices$coef_risk] |>
matrix(ncol = n_causes)
coef_traject_time_est <- fit$par[comp_obj$indices$coef_trajectory_time] |>
matrix(ncol = n_causes)
coef_traject_est <- fit$par[comp_obj$indices$coef_trajectory] |>
matrix(ncol = n_causes)
coef_traject_intercept_est <- coef_traject_est[5, ]
# compute the risk probabilities
probs <- exp(coef_risk[1, ]) / (1 + sum(exp(coef_risk[1, ])))
probs_est <- exp(coef_risk_est[1, ]) / (1 + sum(exp(coef_risk_est[1, ])))
# plot the estimated and true conditional cumulative incidence functions. The
# estimates are the dashed lines
par(mar = c(5, 5, 1, 1), mfcol = c(1, 2))
pts <- seq(1e-8, delta * (1 - 1e-8), length.out = 1000)
for(i in 1:2){
true_vals <- probs[i] * pnorm(
-coef_traject[1, i] * atanh((pts - delta / 2) / (delta / 2)) -
coef_traject[2, i])
estimates <- probs_est[i] * pnorm(
-comp_obj$time_expansion(pts, cause = i) %*% coef_traject_time_est[, i] -
coef_traject_intercept_est[i]) |> drop()
matplot(pts, cbind(true_vals, estimates), xlim = c(0, delta),
ylim = c(0, 1), bty = "l", xlab = "Time", lty = c(1, 2),
ylab = sprintf("Cumulative incidence; cause %d", i),
yaxs = "i", xaxs = "i", type = "l", col = "black")
grid()
}
})
Further illustrations of the estimated model are given below.
# the number of call we made
fit$counts
#> function gradient
#> 232 190
fit$outer.iterations
#> [1] 3
# compute the standard errors from the sandwich estimator
SEs <- diag(sandwich_est) |> sqrt()
SEs_simple <- diag(sandwich_est_simple) |> sqrt()
# compare the estimates with the true values
rbind(`Estimate AGHQ` = fit$par[comp_obj$indices$coef_risk],
`Standard errors` = SEs[comp_obj$indices$coef_risk],
`Standard errors simple` = SEs_simple[comp_obj$indices$coef_risk],
Truth = truth[comp_obj$indices$coef_risk])
#> cause1:risk:(Intercept) cause1:risk:a cause1:risk:b
#> Estimate AGHQ 0.57747 0.98262 0.1391
#> Standard errors 0.07592 0.08423 0.1053
#> Standard errors simple 0.07592 0.08423 0.1053
#> Truth 0.67000 1.00000 0.1000
#> cause2:risk:(Intercept) cause2:risk:a cause2:risk:b
#> Estimate AGHQ -0.4139 0.23007 0.3440
#> Standard errors 0.1033 0.07872 0.1175
#> Standard errors simple 0.1033 0.07872 0.1175
#> Truth -0.4000 0.25000 0.3000
rbind(`Estimate AGHQ` = fit$par[comp_obj$indices$coef_trajectory],
`Standard errors` = SEs[comp_obj$indices$coef_trajectory],
`Standard errors simple` = SEs_simple[comp_obj$indices$coef_trajectory],
Truth = truth[comp_obj$indices$coef_trajectory])
#> cause1:spline1 cause1:spline2 cause1:spline3
#> Estimate AGHQ -2.9825 -3.625 -6.6752
#> Standard errors 0.1641 0.165 0.3374
#> Standard errors simple 0.1641 0.165 0.3373
#> Truth -3.0513 -3.666 -6.6720
#> cause1:spline4 cause1:traject:(Intercept)
#> Estimate AGHQ -4.7854 2.5959
#> Standard errors 0.2232 0.1503
#> Standard errors simple 0.2231 0.1503
#> Truth -4.8560 2.6778
#> cause1:traject:a cause1:traject:b cause2:spline1
#> Estimate AGHQ 0.88400 0.40159 -2.6765
#> Standard errors 0.06576 0.07497 0.2108
#> Standard errors simple 0.06575 0.07497 0.2109
#> Truth 0.80000 0.40000 -2.7771
#> cause2:spline2 cause2:spline3 cause2:spline4
#> Estimate AGHQ -3.1360 -5.6399 -4.1479
#> Standard errors 0.1890 0.4011 0.2565
#> Standard errors simple 0.1892 0.4010 0.2565
#> Truth -3.3481 -6.2334 -4.6450
#> cause2:traject:(Intercept) cause2:traject:a
#> Estimate AGHQ 2.6923 0.24584
#> Standard errors 0.2251 0.06472
#> Standard errors simple 0.2251 0.06472
#> Truth 3.0259 0.25000
#> cause2:traject:b
#> Estimate AGHQ -0.1689
#> Standard errors 0.1198
#> Standard errors simple 0.1198
#> Truth -0.2000
n_vcov <- (2L * n_causes * (2L * n_causes + 1L)) %/% 2L
Sigma
#> [,1] [,2] [,3] [,4]
#> [1,] 0.306 0.008 -0.138 0.197
#> [2,] 0.008 0.759 0.251 -0.250
#> [3,] -0.138 0.251 0.756 -0.319
#> [4,] 0.197 -0.250 -0.319 0.903
log_chol_inv(tail(fit$par, n_vcov))
#> [,1] [,2] [,3] [,4]
#> [1,] 0.33651 -0.1471 -0.07113 0.1426
#> [2,] -0.14711 0.3690 0.41898 -0.1071
#> [3,] -0.07113 0.4190 0.72053 -0.4198
#> [4,] 0.14263 -0.1071 -0.41981 0.5897
# on the log Cholesky scale
rbind(`Estimate AGHQ` = fit$par[comp_obj$indices$vcov_upper],
`Standard errors` = SEs[comp_obj$indices$vcov_upper],
`Standard errors simple` = SEs_simple[comp_obj$indices$vcov_upper],
Truth = truth[comp_obj$indices$vcov_upper])
#> vcov:risk1:risk1 vcov:risk1:risk2 vcov:risk2:risk2
#> Estimate AGHQ -0.5446 -0.25359 -0.5942
#> Standard errors 0.2753 0.22883 0.3426
#> Standard errors simple 0.2753 0.22875 0.3423
#> Truth -0.5921 0.01446 -0.1380
#> vcov:risk1:traject1 vcov:risk2:traject1
#> Estimate AGHQ -0.1226 0.7027
#> Standard errors 0.1953 0.1763
#> Standard errors simple 0.1953 0.1761
#> Truth -0.2495 0.2923
#> vcov:traject1:traject1 vcov:risk1:traject2
#> Estimate AGHQ -0.7763 0.2459
#> Standard errors 0.5252 0.2157
#> Standard errors simple 0.5238 0.2156
#> Truth -0.2485 0.3561
#> vcov:risk2:traject2 vcov:traject1:traject2
#> Estimate AGHQ -0.08115 -0.7230
#> Standard errors 0.28621 0.1277
#> Standard errors simple 0.28583 0.1277
#> Truth -0.29291 -0.1853
#> vcov:traject2:traject2
#> Estimate AGHQ -6.7371
#> Standard errors 1.5913
#> Standard errors simple 1.5629
#> Truth -0.2108
# on the original covariance matrix scale
vcov_est <- log_chol_inv(tail(fit$par, n_vcov))
vcov_est[lower.tri(vcov_est)] <- NA_real_
vcov_SE <- matrix(NA_real_, NROW(vcov_est), NCOL(vcov_est))
vcov_SE[upper.tri(vcov_SE, TRUE)] <-
attr(sandwich_est, "res vcov") |> diag() |> sqrt() |>
tail(n_vcov)
vcov_show <- cbind(Estimates = vcov_est, NA, SEs = vcov_SE)
colnames(vcov_show) <-
c(rep("Est.", NCOL(vcov_est)), "", rep("SE", NCOL(vcov_est)))
print(vcov_show, na.print = "")
#> Est. Est. Est. Est. SE SE SE SE
#> [1,] 0.3365 -0.1471 -0.07113 0.1426 0.1853 0.1173 0.1135 0.1319
#> [2,] 0.3690 0.41898 -0.1071 0.1661 0.1142 0.1396
#> [3,] 0.72053 -0.4198 0.1473 0.1278
#> [4,] 0.5897 0.1845
Sigma # the true values
#> [,1] [,2] [,3] [,4]
#> [1,] 0.306 0.008 -0.138 0.197
#> [2,] 0.008 0.759 0.251 -0.250
#> [3,] -0.138 0.251 0.756 -0.319
#> [4,] 0.197 -0.250 -0.319 0.903
We may allow for different transformations for groups of individuals. Specifically, we can replace the covariates for the trajectory
\[\vec x_{ij}(t) = (\vec h(t)^\top, \vec z_{ij}^\top)^\top\]
with
\[\vec x_{ij}(t) = (\vec h_{l_{ij}}(t)^\top, \vec z_{ij}^\top)^\top\]
where there are \(L\) strata each having their own spline basis \(h_{l}(t)\) and \(l_{ij}\) is the strata that observation \(j\) in cluster \(i\) is in. This is supported in the package using the strata
argument of mmcif_data
. We illustrate this by extending the previous example. First, we assign new model parameters and plot the cumulative incidence functions as before but for each strata.
# assign model parameters
n_causes <- 2L
delta <- 2
# set the betas
coef_risk <- c(.9, 1, .1, -.2, .5, 0, 0, 0, .5,
-.4, .25, .3, 0, .5, .25, 1.5, -.25, 0) |>
matrix(ncol = n_causes)
# set the gammas
coef_traject <- c(-.8, -.45, -1, -.1, -.5, -.4,
.8, .4, 0, .4, 0, .4,
-1.2, .15, -.4, -.15, -.5, -.25,
.25, -.2, 0, -.2, .25, 0) |>
matrix(ncol = n_causes)
# plot the conditional cumulative incidence functions when random effects and
# covariates are all zero
local({
for(strata in 1:3 - 1L){
probs <- exp(coef_risk[1 + strata * 3, ]) /
(1 + sum(exp(coef_risk[1 + strata * 3, ])))
par(mar = c(5, 5, 1, 1), mfcol = c(1, 2))
for(i in 1:2){
plot(\(x) probs[i] * pnorm(
-coef_traject[1 + strata * 2, i] *
atanh((x - delta / 2) / (delta / 2)) -
coef_traject[2 + strata * 2, i]),
xlim = c(0, delta), ylim = c(0, 1), bty = "l",
xlab = sprintf("Time; strata %d", strata + 1L),
ylab = sprintf("Cumulative incidence; cause %d", i),
yaxs = "i", xaxs = "i")
grid()
}
}
})
# the probabilities of each strata
strata_prob <- c(.2, .5, .3)
# set the covariance matrix
Sigma <- c(0.306, 0.008, -0.138, 0.197, 0.008, 0.759, 0.251,
-0.25, -0.138, 0.251, 0.756, -0.319, 0.197, -0.25, -0.319, 0.903) |>
matrix(2L * n_causes)
Then we define a simulation function.
library(mvtnorm)
# simulates a data set with a given number of clusters and maximum number of
# observations per cluster
sim_dat <- \(n_clusters, max_cluster_size){
stopifnot(max_cluster_size > 0,
n_clusters > 0)
cluster_id <- 0L
replicate(n_clusters, simplify = FALSE, {
n_obs <- sample.int(max_cluster_size, 1L)
cluster_id <<- cluster_id + 1L
strata <- sample.int(length(strata_prob), 1L, prob = strata_prob)
# keep only the relevant parameters
coef_risk <- coef_risk[1:3 + (strata - 1L) * 3, ]
coef_traject <- coef_traject[
c(1:2 + (strata - 1L) * 2L, 1:2 + 6L + (strata - 1L) * 2L), ]
# draw the covariates and the left truncation time
covs <- cbind(a = rnorm(n_obs), b = runif(n_obs, -1))
Z <- cbind(1, covs)
delayed_entry <- pmax(runif(n_obs, -1), 0)
cens <- rep(-Inf, n_obs)
while(all(cens <= delayed_entry))
cens <- runif(n_obs, max = 3 * delta)
successful_sample <- FALSE
while(!successful_sample){
rng_effects <- rmvnorm(1, sigma = Sigma) |> drop()
U <- head(rng_effects, n_causes)
eta <- tail(rng_effects, n_causes)
# draw the cause
cond_logits_exp <- exp(Z %*% coef_risk + rep(U, each = n_obs)) |>
cbind(1)
cond_probs <- cond_logits_exp / rowSums(cond_logits_exp)
cause <- apply(cond_probs, 1,
\(prob) sample.int(n_causes + 1L, 1L, prob = prob))
# compute the observed time if needed
obs_time <- mapply(\(cause, idx){
if(cause > n_causes)
return(delta)
# can likely be done smarter but this is more general
coefs <- coef_traject[, cause]
offset <- sum(Z[idx, ] * coefs[-1]) + eta[cause]
rng <- runif(1)
eps <- .Machine$double.eps
root <- uniroot(
\(x) rng - pnorm(
-coefs[1] * atanh((x - delta / 2) / (delta / 2)) - offset),
c(eps^2, delta * (1 - eps)), tol = 1e-12)$root
}, cause, 1:n_obs)
keep <- which(pmin(obs_time, cens) > delayed_entry)
successful_sample <- length(keep) > 0
if(!successful_sample)
next
has_finite_trajectory_prob <- cause <= n_causes
is_censored <- which(!has_finite_trajectory_prob | cens < obs_time)
if(length(is_censored) > 0){
obs_time[is_censored] <- pmin(delta, cens[is_censored])
cause[is_censored] <- n_causes + 1L
}
}
data.frame(covs, cause, time = obs_time, cluster_id, delayed_entry,
strata)[keep, ]
}) |>
do.call(what = rbind) |>
transform(strata = factor(sprintf("s%d", strata)))
}
We sample a data set using the new simulation function.
# sample a data set
set.seed(14712915)
n_clusters <- 1000L
max_cluster_size <- 5L
dat <- sim_dat(n_clusters, max_cluster_size = max_cluster_size)
# show some stats
NROW(dat) # number of individuals
#> [1] 2518
table(dat$cause) # distribution of causes (3 is censored)
#>
#> 1 2 3
#> 639 791 1088
# distribution of observed times by cause
tapply(dat$time, dat$cause, quantile,
probs = seq(0, 1, length.out = 11), na.rm = TRUE)
#> $`1`
#> 0% 10% 20% 30% 40% 50% 60% 70%
#> 2.491e-06 2.254e-02 1.257e-01 3.135e-01 6.053e-01 9.441e-01 1.229e+00 1.553e+00
#> 80% 90% 100%
#> 1.809e+00 1.949e+00 2.000e+00
#>
#> $`2`
#> 0% 10% 20% 30% 40% 50% 60% 70%
#> 2.161e-10 1.023e-03 1.815e-02 1.198e-01 3.706e-01 8.523e-01 1.432e+00 1.802e+00
#> 80% 90% 100%
#> 1.959e+00 1.998e+00 2.000e+00
#>
#> $`3`
#> 0% 10% 20% 30% 40% 50% 60% 70%
#> 0.0001885 0.4570257 0.8623890 1.2217385 1.6003899 2.0000000 2.0000000 2.0000000
#> 80% 90% 100%
#> 2.0000000 2.0000000 2.0000000
# within strata
tapply(dat$time, interaction(dat$cause, dat$strata), quantile,
probs = seq(0, 1, length.out = 11), na.rm = TRUE)
#> $`1.s1`
#> 0% 10% 20% 30% 40% 50% 60% 70%
#> 0.0001022 0.0239065 0.1246552 0.3120237 0.6869977 0.8928432 1.2835877 1.6481640
#> 80% 90% 100%
#> 1.9030874 1.9707153 1.9998886
#>
#> $`2.s1`
#> 0% 10% 20% 30% 40% 50% 60% 70%
#> 0.006063 0.092698 0.286631 0.468654 0.698025 0.919033 1.131461 1.396541
#> 80% 90% 100%
#> 1.707284 1.868491 1.977970
#>
#> $`3.s1`
#> 0% 10% 20% 30% 40% 50% 60% 70%
#> 0.004763 0.567089 0.903437 1.294739 1.601904 2.000000 2.000000 2.000000
#> 80% 90% 100%
#> 2.000000 2.000000 2.000000
#>
#> $`1.s2`
#> 0% 10% 20% 30% 40% 50% 60% 70%
#> 0.001513 0.062654 0.194127 0.367494 0.598768 0.971182 1.203544 1.424611
#> 80% 90% 100%
#> 1.665483 1.868098 1.995039
#>
#> $`2.s2`
#> 0% 10% 20% 30% 40% 50% 60% 70%
#> 2.161e-10 2.177e-04 5.389e-03 4.520e-02 2.077e-01 6.665e-01 1.506e+00 1.866e+00
#> 80% 90% 100%
#> 1.979e+00 1.999e+00 2.000e+00
#>
#> $`3.s2`
#> 0% 10% 20% 30% 40% 50% 60% 70%
#> 0.008548 0.474137 0.889173 1.314717 1.692051 2.000000 2.000000 2.000000
#> 80% 90% 100%
#> 2.000000 2.000000 2.000000
#>
#> $`1.s3`
#> 0% 10% 20% 30% 40% 50% 60% 70%
#> 2.491e-06 1.177e-03 1.072e-02 5.403e-02 4.041e-01 9.582e-01 1.250e+00 1.783e+00
#> 80% 90% 100%
#> 1.974e+00 1.994e+00 2.000e+00
#>
#> $`2.s3`
#> 0% 10% 20% 30% 40% 50% 60% 70%
#> 7.133e-07 1.678e-03 1.950e-02 1.115e-01 4.179e-01 9.793e-01 1.575e+00 1.855e+00
#> 80% 90% 100%
#> 1.970e+00 1.997e+00 2.000e+00
#>
#> $`3.s3`
#> 0% 10% 20% 30% 40% 50% 60% 70%
#> 0.0001885 0.4129234 0.6988777 1.0541778 1.3786664 1.7758420 2.0000000 2.0000000
#> 80% 90% 100%
#> 2.0000000 2.0000000 2.0000000
# distribution of strata
table(dat$strata)
#>
#> s1 s2 s3
#> 577 1233 708
# distribution of the left truncation time
quantile(dat$delayed_entry, probs = seq(0, 1, length.out = 11))
#> 0% 10% 20% 30% 40% 50% 60% 70% 80% 90% 100%
#> 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.1752 0.4210 0.6772 0.9998
Next, we fit the model as before but this time we with strata specific fixed effects and transformations.
library(mmcif)
comp_obj <- mmcif_data(
~ strata + (a + b) : strata - 1, dat, cause = cause, time = time,
cluster_id = cluster_id, max_time = delta, spline_df = 4L,
left_trunc = delayed_entry, strata = strata)
# we need to find the combination of the spline bases that yield a straight
# line to construct the true values using the splines. You can skip this
comb_slope <- sapply(comp_obj$spline, \(spline){
boundary_knots <- spline$boundary_knots
pts <- seq(boundary_knots[1], boundary_knots[2], length.out = 1000)
lm.fit(cbind(1, spline$expansion(pts)), pts)$coef
})
coef_traject_spline <- lapply(1:length(unique(dat$strata)), function(strata){
slopes <- coef_traject[1 + (strata - 1) * 2, ]
comb_slope[-1, ] * rep(slopes, each = NROW(comb_slope) - 1)
}) |>
do.call(what = rbind)
coef_traject_spline_fixef <- lapply(1:length(unique(dat$strata)), function(strata){
slopes <- coef_traject[1 + (strata - 1) * 2, ]
intercepts <- coef_traject[2 + (strata - 1) * 2, ]
fixefs <- coef_traject[7:8 + (strata - 1) * 2, ]
rbind(intercepts + comb_slope[1, ] * slopes,
fixefs)
}) |>
do.call(what = rbind)
coef_traject_spline <- rbind(coef_traject_spline, coef_traject_spline_fixef)
# handle that model.matrix in mmcif_data gives a different permutation of
# the parameters
permu <- c(seq(1, 7, by = 3), seq(2, 8, by = 3), seq(3, 9, by = 3))
# set true values
truth <- c(coef_risk[permu, ],
coef_traject_spline[c(1:12, permu + 12L), ],
log_chol(Sigma))
# find the starting values
system.time(start <- mmcif_start_values(comp_obj, n_threads = 4L))
#> user system elapsed
#> 0.244 0.000 0.067
# we can verify that the gradient is correct again
gr_package <- mmcif_logLik_grad(
comp_obj, truth, n_threads = 4L, is_log_chol = TRUE)
gr_num <- numDeriv::grad(
mmcif_logLik, truth, object = comp_obj, n_threads = 4L, is_log_chol = TRUE,
method = "simple")
rbind(`Numerical gradient` = gr_num, `Gradient package` = gr_package)
#> [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9]
#> Numerical gradient -27.70 0.5557 3.167 -4.275 -7.888 20.91 -15.67 -9.176 -22.88
#> Gradient package -27.69 0.5710 3.177 -4.267 -7.870 20.92 -15.66 -9.169 -22.87
#> [,10] [,11] [,12] [,13] [,14] [,15] [,16] [,17] [,18]
#> Numerical gradient 12.02 7.579 -9.007 10.10 24.54 -36.43 23.52 -11.57 18.89
#> Gradient package 12.02 7.605 -9.003 10.11 24.56 -36.42 23.52 -11.56 18.90
#> [,19] [,20] [,21] [,22] [,23] [,24] [,25] [,26] [,27]
#> Numerical gradient -1.542 12.25 10.04 -16.47 15.99 -18.4 -10.09 -3.850 15.52
#> Gradient package -1.535 12.26 10.04 -16.47 16.00 -18.4 -10.09 -3.849 15.52
#> [,28] [,29] [,30] [,31] [,32] [,33] [,34] [,35] [,36]
#> Numerical gradient 10.30 9.963 0.2325 25.19 -16.82 48.57 10.18 -9.259 7.031
#> Gradient package 10.31 9.965 0.2378 25.21 -16.80 48.57 10.20 -9.239 7.035
#> [,37] [,38] [,39] [,40] [,41] [,42] [,43] [,44] [,45] [,46]
#> Numerical gradient 2.899 2.786 7.047 6.840 6.110 2.291 -2.23 -28.18 37.46 4.682
#> Gradient package 2.904 2.793 7.049 6.842 6.111 2.291 -2.23 -28.17 37.47 4.686
#> [,47] [,48] [,49] [,50] [,51] [,52] [,53] [,54] [,55]
#> Numerical gradient -28.41 27.85 15.31 -1.697 -0.3346 13.87 -4.902 42.64 23
#> Gradient package -28.40 27.86 15.32 -1.693 -0.3308 13.87 -4.888 42.66 23
#> [,56] [,57] [,58] [,59] [,60] [,61] [,62] [,63] [,64]
#> Numerical gradient 47.49 -29.17 13.91 26.06 -8.109 -0.2252 12.53 -5.330 25.06
#> Gradient package 47.52 -29.14 13.92 26.07 -8.102 -0.2216 12.57 -5.194 25.06
#> [,65] [,66] [,67] [,68] [,69] [,70]
#> Numerical gradient -29.12 -10.13 -0.9177 19.73 -5.218 17.82
#> Gradient package -29.14 -10.11 -0.9119 19.72 -5.214 17.84
# optimize the log composite likelihood
system.time(fit <- mmcif_fit(start$upper, comp_obj, n_threads = 4L))
#> user system elapsed
#> 53.143 0.004 13.288
# the log composite likelihood at different points
mmcif_logLik(comp_obj, truth, n_threads = 4L, is_log_chol = TRUE)
#> [1] -3188
mmcif_logLik(comp_obj, start$upper, n_threads = 4L, is_log_chol = TRUE)
#> [1] -3454
-fit$value
#> [1] -3113
Then we compute the sandwich estimator. The Hessian is currently computed with numerical differentiation which is why it takes a while.
system.time(sandwich_est <- mmcif_sandwich(comp_obj, fit$par, n_threads = 4L))
#> user system elapsed
#> 83.819 0.007 20.975
# setting order equal to zero yield no Richardson extrapolation and just
# standard symmetric difference quotient. This is less precise but faster
system.time(sandwich_est_simple <-
mmcif_sandwich(comp_obj, fit$par, n_threads = 4L, order = 0L))
#> user system elapsed
#> 19.09 0.00 4.80
We show the estimated and true the conditional cumulative incidence functions (the dashed curves are the estimates) when the random effects are zero and the covariates are zero, \(a_{ij} = b_{ij} = 0\).
local({
# get the estimates
coef_risk_est <- fit$par[comp_obj$indices$coef_risk] |>
matrix(ncol = n_causes)
coef_traject_time_est <- fit$par[comp_obj$indices$coef_trajectory_time] |>
matrix(ncol = n_causes)
coef_traject_est <- fit$par[comp_obj$indices$coef_trajectory] |>
matrix(ncol = n_causes)
for(strata in 1:3 - 1L){
# compute the risk probabilities
probs <- exp(coef_risk[1 + strata * 3, ]) /
(1 + sum(exp(coef_risk[1 + strata * 3, ])))
probs_est <- exp(coef_risk_est[1 + strata, ]) /
(1 + sum(exp(coef_risk_est[1 + strata, ])))
# plot the estimated and true conditional cumulative incidence functions. The
# estimates are the dashed lines
par(mar = c(5, 5, 1, 1), mfcol = c(1, 2))
pts <- seq(1e-8, delta * (1 - 1e-8), length.out = 1000)
for(i in 1:2){
true_vals <- probs[i] * pnorm(
-coef_traject[1 + strata * 2, i] *
atanh((pts - delta / 2) / (delta / 2)) -
coef_traject[2 + strata * 2, i])
estimates <- probs_est[i] * pnorm(
-comp_obj$time_expansion(pts, cause = i, which_strata = strata + 1L) %*%
coef_traject_time_est[, i] -
coef_traject_est[13 + strata, i]) |> drop()
matplot(pts, cbind(true_vals, estimates), xlim = c(0, delta),
ylim = c(0, 1), bty = "l",
xlab = sprintf("Time; strata %d", strata + 1L), lty = c(1, 2),
ylab = sprintf("Cumulative incidence; cause %d", i),
yaxs = "i", xaxs = "i", type = "l", col = "black")
grid()
}
}
})