## ----include = FALSE---------------------------------------------------------- knitr::opts_chunk$set( collapse = TRUE, comment = "#>", out.width = "100%", fig.keep = "none", fig.show = "hide" ) ## ----setup, echo=TRUE, warning=FALSE, message=FALSE, cache=TRUE--------------- # Loading all packages required # Data manipulation and reshaping library(dplyr) # For data filtering, grouping, and summarising. library(tidyr) # For reshaping data (e.g., pivot_longer, pivot_wider). # Plotting and visualization library(ggplot2) # For creating all the ggplot-based visualizations. library(gridExtra) # For arranging multiple plots on a single page. library(ggrepel) # For improved label placement in plots (e.g., volcano plots). library(gplots) # For heatmap.2, which is used to generate heatmaps. library(plot3D) # For creating 3D scatter plots in PCA and sPLS-DA analyses. library(reshape2) # For data transformation (e.g., melt) in cross-validation plots. # Statistical analysis library(mixOmics) # For multivariate analyses (PCA, sPLS-DA, etc.). library(e1071) # For computing skewness and kurtosis. library(pROC) # For ROC curve generation in machine learning model evaluation. # Machine learning library(xgboost) # For building XGBoost classification models. library(randomForest) # For building Random Forest classification models. library(caret) # For cross-validation and other machine learning utilities. # Package development and document rendering library(knitr) # For knitting RMarkdown files and setting chunk options. library(devtools) # For installing the development version of the package from GitHub. # Load in the CytoProfile package library(CytoProfile) # Loading in data data("ExampleData1") data_df <- ExampleData1 # Creating a temporary directory to store output files path <- tempdir(check = TRUE) ## ----EDA 1, echo=TRUE, warning=FALSE, message=FALSE, cache=TRUE-------------- # Generating boxplots to check for outliers for raw values cyt_bp(data_df[, -c(1:3)], pdf_title = file.path(path, "boxplot_by_cytokine_raw.pdf")) # Removing the first 3 columns to retain only continuous variables. # Generating boxplots to check for outliers for log2 values cyt_bp(data_df[, -c(1:3)], pdf_title = file.path(path, "boxplot_by_cytokine_log2.pdf"), scale = "log2") ## ----EDA 2, echo=TRUE, warning=FALSE, message=FALSE, cache=TRUE--------------- # Raw values for group-specific boxplots cyt_bp2(data_df[, -c(3)], pdf_title = file.path(path, "boxplot_by_group_and_treatment_raw.pdf"), scale = NULL) # Log2-transformed group-specific boxplots cyt_bp2(data_df[, -c(3)], pdf_title = file.path(path, "boxplot_by_group_and_treatment_log2.pdf"), scale = "log2") ## ----EDA 3, echo=TRUE, warning=FALSE, message=FALSE, cache=TRUE--------------- # Histogram for overall raw data cyt_skku(data_df[, -c(1:3)], pdf_title = file.path(path, "skew_and_kurtosis.pdf"), group_cols = NULL) # Histogram with grouping (e.g., "Group") cyt_skku(ExampleData1[, -c(2:3)], pdf_title = file.path(path, "skew_and_kurtosis_2.pdf"), group_cols = c("Group")) ## ----EDA 4, echo=TRUE, warning=FALSE, message=FALSE, cache=TRUE-------------- cytokine_mat <- ExampleData1[, -c(1:3)] cytokineNames <- colnames(cytokine_mat) nCytokine <- length(cytokineNames) results <- cyt_skku(ExampleData1[, -c(3)], print_res_log = TRUE, group_cols = c("Group", "Treatment")) pdf(file = file.path(path, "bar_error_plot.pdf")) oldpar <- par(no.readonly = TRUE) par(mfrow = c(2,2), mar = c(4, 4, 2, 1)) for (k in 1:nCytokine) { center_df <- data.frame(name = rownames(results[,,k]), results[,,k]) cyt_errbp(center_df, p_lab = FALSE, es_lab = FALSE, class_symbol = TRUE, y_lab = "Concentration in log2 scale", main = cytokineNames[k]) } par(oldpar) if (dev.cur() > 1) dev.off() ## ----EDA 5, echo=TRUE, warning=FALSE, message=FALSE, cache=TRUE-------------- data_df <- ExampleData1[, -c(3)] cyt_mat <- log2(data_df[, -c(1:2)]) data_df1 <- data.frame(data_df[, 1:2], cyt_mat) cytokineNames <- colnames(cyt_mat) nCytokine <- length(cytokineNames) condt <- !is.na(cyt_mat) & (cyt_mat > 0) Cutoff <- min(cyt_mat[condt], na.rm = TRUE) / 10 p_aov_mat <- matrix(NA, nrow = nCytokine, ncol = 3) dimnames(p_aov_mat) <- list(cytokineNames, c("Group", "Treatment", "Interaction")) p_groupComp_mat <- matrix(NA, nrow = nCytokine, ncol = 3) dimnames(p_groupComp_mat) <- list(cytokineNames, c("2-1", "3-1", "3-2")) ssmd_groupComp_stm_mat <- mD_groupComp_stm_mat <- p_groupComp_stm_mat <- p_groupComp_mat for (i in 1:nCytokine) { Cytokine <- (cyt_mat[, i] + Cutoff) cytokine_aov <- aov(Cytokine ~ Group * Treatment, data = data_df) aov_table <- summary(cytokine_aov)[[1]] p_aov_mat[i, ] <- aov_table[1:3, 5] p_groupComp_mat[i, ] <- TukeyHSD(cytokine_aov)$Group[1:3, 4] p_groupComp_stm_mat[i, ] <- TukeyHSD(cytokine_aov)$`Group:Treatment`[1:3, 4] mD_groupComp_stm_mat[i, ] <- TukeyHSD(cytokine_aov)$`Group:Treatment`[1:3, 1] ssmd_groupComp_stm_mat[i, ] <- mD_groupComp_stm_mat[i, ] / sqrt(2 * aov_table["Residuals", "Mean Sq"]) } results <- cyt_skku(ExampleData1[, -c(3)], print_res_log = TRUE, group_cols = c("Group", "Treatment")) pdf(file = file.path(path, "bar_error_plot_enriched.pdf")) oldpar <- par(no.readonly = TRUE) par(mfrow = c(2,2), mar = c(3, 3, 2, 1)) for (k in 1:nCytokine) { result_mat <- results[1:9, , k] center_df <- data.frame( name = rownames(result_mat), result_mat[, c("center", "spread")], p.value = c(1, p_groupComp_stm_mat[k, 1:2]), effect.size = c(0, ssmd_groupComp_stm_mat[k, 1:2]) ) cyt_errbp(center_df, p_lab = TRUE, es_lab = TRUE, class_symbol = TRUE, y_lab = "Concentration in log2 scale", main = cytokineNames[k]) } par(oldpar) if (dev.cur() > 1) dev.off() ## ----Univariate 1, echo=TRUE, warning=FALSE, message=FALSE, cache=TRUE------- # Performing Two Sample T-test and Mann Whitney U Test data_df <- ExampleData1[, -c(3)] data_df <- dplyr::filter(data_df, Group != "ND", Treatment != "Unstimulated") # Two sample T-test cyt_ttest(data_df[, c(1:2, 5:6)], scale = "log2", verbose = TRUE, format_output = TRUE) # Mann-Whitney U Test cyt_ttest(data_df[, c(1:2, 5:6)], verbose = TRUE) ## ----Univariate 2, echo=TRUE, warning=FALSE, message=FALSE, cache=TRUE------- # Perform ANOVA comparisons test (example with 2 cytokines) cyt_anova(data_df[, c(1:2, 5:6)], format_output = TRUE) ## ----Multivariate 1, echo=TRUE, warning=FALSE, message=FALSE, cache=TRUE------ # cyt.plsda function. data <- ExampleData1[, -c(3)] data_df <- dplyr::filter(data, Group != "ND" & Treatment == "CD3/CD28") cyt_splsda(data_df, pdf_title = file.path(path, "example_spls_da_analysis.pdf"), colors = c("black", "purple"), bg = FALSE, scale = "log2", ellipse = TRUE, conf_mat = FALSE, var_num = 25, cv_opt = "loocv", comp_num = 2, pch_values = c(16, 4), group_col = "Group", group_col2 = "Treatment", roc = TRUE) ## ----Multivariate 2, echo=TRUE, warning=FALSE, message=FALSE, cache=TRUE------ data <- ExampleData1[, -c(3,23)] data_df <- dplyr::filter(data, Group != "ND" & Treatment != "Unstimulated") cyt_pca(data_df, pdf_title = file.path(path, "example_pca_analysis.pdf"), colors = c("black", "red2"), scale = "log2", comp_num = 3, pch_values = c(16, 4), style = "3D", group_col = "Group", group_col2 = "Treatment") cyt_pca(data_df, pdf_title = file.path(path, "example_pca_analysis_2.pdf"), colors = c("black", "red2"), scale = "log2", comp_num = 2, pch_values = c(16, 4), group_col = "Group") ## ----EDA 6, echo=TRUE, warning=FALSE, message=FALSE, cache=TRUE--------------- # Generating Volcano Plot data_df <- ExampleData1[, -c(2:3)] cyt_volc(data_df, group_col = "Group", cond1 = "T2D", cond2 = "ND", fold_change_thresh = 2.0, top_labels = 15) ## ----EDA 7, echo=TRUE, warning=FALSE, message=FALSE, cache=TRUE, fig.width=5, fig.height=4---- # Generating Heat map cyt_heatmap(data = data_df, scale = "log2", # Optional scaling annotation_col_name = "Group", title = NULL) ## ----EDA 8, echo=TRUE, warning=FALSE, message=FALSE, cache=TRUE--------------- # Generating dual flashlights plot data_df <- ExampleData1[, -c(2:3)] dfp <- cyt_dualflashplot(data_df, group_var = "Group", group1 = "T2D", group2 = "ND", ssmd_thresh = -0.2, log2fc_thresh = 1, top_labels = 10) # Print the plot dfp # Print the table data used for plotting print(dfp$data) ## ----ML1, echo=TRUE, warning=FALSE, message=FALSE, cache=TRUE----------------- # Using XGBoost for classification data_df0 <- ExampleData1 data_df <- data.frame(data_df0[, 1:3], log2(data_df0[, -c(1:3)])) data_df <- data_df[, -c(2:3)] data_df <- dplyr::filter(data_df, Group != "ND") cyt_xgb(data = data_df, group_col = "Group", nrounds = 500, max_depth = 4, eta = 0.05, nfold = 5, cv = TRUE, eval_metric = "mlogloss", early_stopping_rounds = NULL, top_n_features = 10, verbose = 0, plot_roc = TRUE, print_results = FALSE) ## ----ML2, echo=TRUE, warning=FALSE, message=FALSE, cache=TRUE----------------- # Using Random Forest for classification cyt_rf(data = data_df, group_col = "Group", k_folds = 5, ntree = 1000, mtry = 4, run_rfcv = TRUE, plot_roc = TRUE, verbose = FALSE)