--- title: "Getting started with bigPLScox" shorttitle: "Getting started with bigPLScox" author: - name: "Frédéric Bertrand" affiliation: - Cedric, Cnam, Paris email: frederic.bertrand@lecnam.net date: "`r Sys.Date()`" output: rmarkdown::html_vignette: toc: true vignette: > %\VignetteIndexEntry{Getting started with bigPLScox} %\VignetteEngine{knitr::rmarkdown} %\VignetteEncoding{UTF-8} --- ```{r setup, include = FALSE} knitr::opts_chunk$set( collapse = TRUE, comment = "#>", fig.path = "figures/getting-started-", fig.width = 7, fig.height = 4.5, dpi = 150, message = FALSE, warning = FALSE ) ``` # Introduction The **bigPLScox** package provides tools to fit Partial Least Squares (PLS) models tailored to the Cox proportional hazards setting. It includes cross-validation helpers, diagnostic utilities, and interfaces that work with both in-memory matrices and `bigmemory` objects. This vignette walks through a core workflow on the allelotyping dataset bundled with the package. # Loading the data ```{r load-data} library(bigPLScox) data(micro.censure) data(Xmicro.censure_compl_imp) Y_all <- micro.censure$survyear[1:80] status_all <- micro.censure$DC[1:80] X_all <- apply( as.matrix(Xmicro.censure_compl_imp), MARGIN = 2, FUN = as.numeric )[1:80, ] set.seed(2024) train_id <- 1:60 test_id <- 61:80 X_train <- X_all[train_id, ] X_test <- X_all[test_id, ] Y_train <- Y_all[train_id] Y_test <- Y_all[test_id] status_train <- status_all[train_id] status_test <- status_all[test_id] ``` The original factor-based design matrix is also available should you prefer to work with categorical predictors directly. ```{r original-design} X_train_raw <- Xmicro.censure_compl_imp[train_id, ] X_test_raw <- Xmicro.censure_compl_imp[test_id, ] ``` # Exploring deviance residuals `computeDR()` extracts deviance residuals from a null Cox model. Inspecting them before fitting the PLS components helps detect anomalies in the survival outcomes. ```{r deviance-residuals} residuals_overview <- computeDR(Y_train, status_train, plot = TRUE) eta_null <- rep(0, length(Y_train)) head(residuals_overview) if (requireNamespace("bench", quietly = TRUE)) { benchmark_dr <- bench::mark( survival = computeDR(Y_train, status_train, engine = "survival"), cpp = computeDR(Y_train, status_train, engine = "cpp", eta = eta_null), iterations = 10, check = FALSE ) benchmark_dr } all.equal( as.numeric(computeDR(Y_train, status_train, engine = "survival")), as.numeric(computeDR(Y_train, status_train, engine = "cpp", eta = eta_null)), tolerance = 1e-7 ) ``` # Fitting Cox PLS models The matrix interface to `coxgpls()` mirrors `survival::coxph()` while adding latent components that stabilise estimation in high dimensions. ```{r fit-coxgpls} set.seed(123) cox_pls_fit <- coxgpls( Xplan = X_train, time = Y_train, status = status_train, ncomp = 6, ind.block.x = c(3, 10, 20) ) cox_pls_fit ``` A formula interface is also available when working with data frames that include both predictors and survival outcomes. ```{r fit-formula} cox_pls_fit_formula <- coxgpls( ~ ., Y_train, status_train, ncomp = 6, ind.block.x = c(3, 10, 20), dataXplan = data.frame(X_train_raw) ) cox_pls_fit_formula ``` # Cross-validation utilities Repeated cross-validation helps determine the appropriate number of latent components. Provide either a matrix or a list with `x`, `time`, and `status` entries. ```{r cv-coxgpls} set.seed(123456) cv_results <- suppressWarnings(cv.coxgpls( list(x = X_train, time = Y_train, status = status_train), nt = 6, ind.block.x = c(3, 10, 20) )) cv_results ``` The selected number of components is stored under `cv_results$opt_nt`. Use it to refit the model with the deviance residual solver for comparison. ```{r fit-coxgplsdr} set.seed(123456) cox_pls_dr <- coxgplsDR( Xplan = X_train, time = Y_train, status = status_train, ncomp = cv_results$nt, ind.block.x = c(3, 10, 20) ) cox_pls_dr ``` # DK-splines extension For flexible baseline hazards the `coxDKgplsDR()` estimator augments the PLS components with DK-splines. The interface mirrors the previous functions. ```{r dk-splines} cox_DKsplsDR_fit <- coxDKgplsDR( Xplan = X_train, time = Y_train, status = status_train, ncomp = 6, validation = "CV", ind.block.x = c(3, 10, 20), verbose = FALSE ) cox_DKsplsDR_fit ``` Cross-validation is available for the DK-splines estimator as well. ```{r cv-dk-splines} set.seed(2468) cv_coxDKgplsDR_res <- suppressWarnings(cv.coxDKgplsDR( list(x = X_train, time = Y_train, status = status_train), nt = 6, ind.block.x = c(3, 10, 20) )) cv_coxDKgplsDR_res # Unified prediction comparison if (requireNamespace("bigmemory", quietly = TRUE)) { library(bigmemory) X_big_train <- bigmemory::as.big.matrix(X_train) X_big_test <- bigmemory::as.big.matrix(X_test) big_fit <- big_pls_cox(X_big_train, Y_train, status_train, ncomp = 4) gd_fit <- big_pls_cox_gd(X_big_train, Y_train, status_train, ncomp = 4, max_iter = 200) risk_table <- data.frame( subject = seq_along(test_id), big_pls = predict(big_fit, newdata = X_big_test, type = "link", comps = 1:4), big_pls_gd = predict(gd_fit, newdata = X_big_test, type = "link", comps = 1:4) ) if (requireNamespace("plsRcox", quietly = TRUE)) { pls_fit <- try(plsRcox::plsRcox(Y_train, status_train, X_train_raw, nt = 4), silent = TRUE) if (!inherits(pls_fit, "try-error") && !is.null(pls_fit$Coefficients)) { risk_table$plsRcox <- as.numeric(as.matrix(X_test_raw) %*% pls_fit$Coefficients) } } risk_table apply( risk_table[-1], 2, function(lp) { survival::concordance(survival::Surv(Y_test, status_test) ~ lp)$concordance } ) } ``` # Next steps * `vignette("bigPLScox-overview", package = "bigPLScox")` summarises the main modelling functions and their big-memory counterparts. * `vignette("bigPLScox-benchmarking", package = "bigPLScox")` explains how to reproduce performance comparisons with the scripts under `inst/benchmarks/`. * The README and package website provide further references and data preparation tips for large-scale studies.