--- title: "Analyse your data" output: rmarkdown::html_vignette vignette: > %\VignetteIndexEntry{Analyse your data} %\VignetteEngine{knitr::rmarkdown} %\VignetteEncoding{UTF-8} --- When you start to use Armadillo as a backend for DataSHIELD you can use the `DSMolgenisArmadillo` package for research purposes. There is a default workflow in DataSHIELD to do analysis. These are the steps that you need to take: ### Authenticate First obtain a token from the authentication server to authenticate in DataSHIELD. ```r # Load the necessary packages. library(dsBaseClient) library(DSMolgenisArmadillo) # specify server url armadillo_url <- "https://armadillo-demo.molgenis.net" # get token from central authentication server token <- armadillo.get_token(armadillo_url) #> [1] "We're opening a browser so you can log in with code 5FW3FV" ``` Then build a login dataframe and perform the login on the Armadillo server. The important part is to specify the driver. This should be `ArmadilloDriver`. ```r # build the login dataframe builder <- DSI::newDSLoginBuilder() builder$append( server = "armadillo", url = armadillo_url, token = token, driver = "ArmadilloDriver", profile = "xenon", ) # create loginframe login_data <- builder$build() # login into server conns <- DSI::datashield.login(logins = login_data, assign = FALSE) ``` > You can append multiple servers to the login frame to perform an analysis across multiple cohorts. ### Assign data To work with DataSHIELD you need to be able to query data. You can do this by assigning data in the Armadillo service. #### Assign data using expressions You can assign values from expressions to symbols. ```r # assign some data to 'K' datashield.assign.expr(conns = conns, symbol = "K", "c(10,50,100)") ``` ```r # calculate the mean of 'K' to see that the assignment has worked ds.mean("K", datasources = conns) #> $Mean.by.Study #> EstimatedMean Nmissing Nvalid Ntotal #> armadillo 53.33333 0 3 3 #> #> $Nstudies #> [1] 1 #> #> $ValidityMessage #> ValidityMessage #> armadillo "VALID ANALYSIS" ``` #### Assign data from tables You can check which tables (`data.frame`'s) are available on the Armadillo. ```r datashield.tables(conns) #> $armadillo #> [1] "study1/2_1-core-1_0/nonrep" "study1/2_1-core-1_0/yearlyrep" #> [3] "study1/1_1-outcome-1_0/yearlyrep" "gecko/2_1-core-1_0/trimesterrep" #> [5] "gecko/2_1-core-1_0/nonrep" "gecko/2_1-core-1_0/yearlyrep" #> [7] "gecko/2_1-core-1_0/monthlyrep" "gecko/1_1-outcome-1_0/nonrep" #> [9] "gecko/1_1-outcome-1_0/yearlyrep" "test/data/LT-example-dataset_long-format" #> [11] "test/data/d" "trajectories/data/alspac" #> [13] "trajectories/data/chs" "trajectories/data/bib" #> [15] "trajectories/data/bcg" "trajectories/data/d" #> [17] "trajectories/data/probit" "inma/1_2_urban_ath_1_0/yearly_rep" #> [19] "inma/1_2_urban_ath_1_0/trimester_rep" "inma/1_2_urban_ath_1_0/non_rep" #> [21] "inma/1_1_outcome_ath_1_0/trimester_rep" "inma/1_1_outcome_ath_1_0/non_rep" #> [23] "inma/1_0_outcome_ath_1_0/trimester_rep" "inma/1_0_outcome_ath_1_0/non_rep" #> [25] "longitools/testparquet/LT_example_data" "longitools/mydata/nonrep" ``` And load data from one of these tables. 
And load data from one of these tables.

```r
# assign table data to a symbol
datashield.assign.table(
  conns = conns,
  table = "gecko/2_1-core-1_0/nonrep",
  symbol = "core_nonrep"
)
```

```r
# check the columns in the non-repeated data
ds.colnames("core_nonrep", datasources = conns)
#> $armadillo
#>   [1] "row_id" "child_id" "mother_id" "cohort_id" "preg_no"
#>   [6] "child_no" "coh_country" "recruit_age" "cob_m" "ethn1_m"
#>  [11] "ethn2_m" "ethn3_m" "agebirth_m_y" "agebirth_m_d" "death_m"
#>  [16] "death_m_age" "prepreg_weight" "prepreg_weight_mes" "prepreg_weight_ga" "latepreg_weight"
#>  [21] "latepreg_weight_mes" "latepreg_weight_ga" "preg_gain" "preg_gain_mes" "height_m"
#>  [26] "height_mes_m" "prepreg_dia" "preg_dia" "preg_thyroid" "preg_fever"
#>  [31] "preeclam" "preg_ht" "asthma_m" "prepreg_psych" "preg_psych"
#>  [36] "ppd" "prepreg_smk" "prepreg_cig" "preg_smk" "preg_cig"
#>  [41] "prepreg_alc" "prepreg_alc_unit" "preg_alc" "preg_alc_unit" "folic_prepreg"
#>  [46] "folic_preg12" "folic_post12" "parity_m" "preg_plan" "mar"
#>  [51] "ivf" "outcome" "mode_delivery" "plac_abrup" "cob_p"
#>  [56] "cob_p_fath" "ethn1_p" "ethn2_p" "ethn3_p" "ethn_p_fath"
#>  [61] "agebirth_p_y" "agebirth_p_d" "agebirth_p_fath" "death_p" "death_p_age"
#>  [66] "death_p_fath" "weight_f1" "weight_mes_f1" "weight_f1_fath" "height_f1"
#>  [71] "height_mes_f1" "height_f1_fath" "dia_bf" "asthma_bf" "psych_bf"
#>  [76] "smk_p" "smk_cig_p" "smk_fath" "birth_month" "birth_year"
#>  [81] "apgar" "neo_unit" "sex" "plurality" "ga_lmp"
#>  [86] "ga_us" "ga_mr" "ga_bj" "birth_weight" "birth_length"
#>  [91] "birth_head_circum" "weight_who_ga" "plac_weight" "con_anomalies" "major_con_anomalies"
#>  [96] "cer_palsy" "sibling_pos" "death_child" "death_child_age" "breastfed_excl"
#> [101] "breastfed_any" "breastfed_ever" "solid_food" "childcare_intro" "cats_preg"
#> [106] "dogs_preg" "cats_quant_preg" "dogs_quant_preg"
```

#### Assign data at login time

You can also specify a table in the login frame, so the data is assigned when you log in.

```r
# build the login dataframe
builder <- DSI::newDSLoginBuilder()
builder$append(
  server = "armadillo",
  url = armadillo_url,
  token = token,
  driver = "ArmadilloDriver",
  table = "gecko/2_1-core-1_0/nonrep",
  profile = "xenon"
)

# create the loginframe
login_data <- builder$build()

# login to the server and assign the table to 'core_nonrep'
conns <- DSI::datashield.login(logins = login_data, assign = TRUE, symbol = "core_nonrep")
```
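Whichever way you assign, you can check which symbols now exist in the server-side session. A small sketch using DSI's `datashield.symbols` and `ds.dim`:

```r
# list the symbols assigned in the server-side session
datashield.symbols(conns)

# check the dimensions of the assigned table
ds.dim("core_nonrep", datasources = conns)
```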
### Subsetting data

Before you start working with the data, you can subset it to the specific range of variables you want to use.

```r
# assign the repeated data to reshape
datashield.assign.table(
  conns = conns,
  table = "gecko/2_1-core-1_0/yearlyrep",
  symbol = "core_yearlyrep"
)

# check the dimensions of the repeated measures
ds.dim("core_yearlyrep", datasources = conns)
#> $`dimensions of core_yearlyrep in armadillo`
#> [1] 19000    34
#>
#> $`dimensions of core_yearlyrep in combined studies`
#> [1] 19000    34

# subset the data to the first three years (age_years 0 to 2)
ds.dataFrameSubset(
  df.name = "core_yearlyrep",
  newobj = "core_yearlyrep_1_3",
  V1.name = "core_yearlyrep$age_years",
  V2.name = "2",
  Boolean.operator = "<=",
  datasources = conns
)
#> $is.object.created
#> [1] "A data object has been created in all specified data sources"
#>
#> $validity.check
#> [1] "<core_yearlyrep_1_3> appears valid in all sources"

# check the columns
ds.colnames("core_yearlyrep_1_3", datasources = conns)
#> $armadillo
#>  [1] "row_id" "child_id" "age_years" "cohab_" "occup_m_"
#>  [6] "occupcode_m_" "edu_m_" "occup_f1_" "occup_f1_fath" "occup_f2_"
#> [11] "occup_f2_fath" "occupcode_f1_" "occupcode_f1_fath" "occupcode_f2_" "occupcode_f2_fath"
#> [16] "edu_f1_" "edu_f1_fath" "edu_f2_" "edu_f2_fath" "childcare_"
#> [21] "childcarerel_" "childcareprof_" "childcarecentre_" "smk_exp" "pets_"
#> [26] "cats_" "cats_quant_" "dogs_" "dogs_quant_" "mental_exp"
#> [31] "hhincome_" "fam_splitup" "famsize_child" "famsize_adult"

# check the dimensions again
ds.dim("core_yearlyrep_1_3", datasources = conns)
#> $`dimensions of core_yearlyrep_1_3 in armadillo`
#> [1] 3000   34
#>
#> $`dimensions of core_yearlyrep_1_3 in combined studies`
#> [1] 3000   34
```

```r
# strip the redundant columns, keeping complete cases only
ds.dataFrame(
  x = c(
    "core_yearlyrep_1_3$child_id",
    "core_yearlyrep_1_3$age_years",
    "core_yearlyrep_1_3$dogs_",
    "core_yearlyrep_1_3$cats_",
    "core_yearlyrep_1_3$pets_"
  ),
  completeCases = TRUE,
  newobj = "core_yearlyrep_1_3_stripped",
  datasources = conns
)
#> $is.object.created
#> [1] "A data object has been created in all specified data sources"
#>
#> $validity.check
#> [1] "<core_yearlyrep_1_3_stripped> appears valid in all sources"
```

### Transform data

To work with data stored in long format, you generally need two methods: the DataSHIELD `reshape` and `merge` functions. You can reshape data with the Armadillo to transform it from [wide format to long format](https://www.theanalysisfactor.com/wide-and-long-data/) and vice versa, using the `ds.reShape` function:

```r
# reshape the data for the wide-format variables (yearlyrep)
ds.reShape(
  data.name = "core_yearlyrep_1_3_stripped",
  timevar.name = "age_years",
  idvar.name = "child_id",
  v.names = c("pets_", "cats_", "dogs_"),
  direction = "wide",
  newobj = "core_yearlyrep_1_3_wide",
  datasources = conns
)
#> $is.object.created
#> [1] "A data object has been created in all specified data sources"
#>
#> $validity.check
#> [1] "<core_yearlyrep_1_3_wide> appears valid in all sources"
```

```r
# show the reshaped columns of the new data frame
ds.colnames("core_yearlyrep_1_3_wide", datasources = conns)
#> $armadillo
#>  [1] "child_id" "pets_.0" "cats_.0" "dogs_.0" "pets_.1" "cats_.1" "dogs_.1" "pets_.2" "cats_.2" "dogs_.2"
```
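`ds.reShape` can also go the other way, from wide back to long. A minimal sketch (not run here; it mirrors base R's `reshape()` arguments, so check the `ds.reShape` documentation for the exact `varying` specification):

```r
# reshape back from wide to long format (sketch)
ds.reShape(
  data.name = "core_yearlyrep_1_3_wide",
  varying = list(
    c("pets_.0", "pets_.1", "pets_.2"),
    c("cats_.0", "cats_.1", "cats_.2"),
    c("dogs_.0", "dogs_.1", "dogs_.2")
  ),
  v.names = c("pets_", "cats_", "dogs_"),
  timevar.name = "age_years",
  idvar.name = "child_id",
  direction = "long",
  newobj = "core_yearlyrep_1_3_long",
  datasources = conns
)
```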
Once you have reshaped and subsetted the data, you often need to merge your data frame with others to build your analysis data frame. You can do this using the `ds.merge` function:

```r
# merge the non-repeated table with the wide-format repeated table
# make sure the disclosure measure regarding stringshort is set to '100'
ds.merge(
  x.name = "core_nonrep",
  y.name = "core_yearlyrep_1_3_wide",
  by.x.names = "child_id",
  by.y.names = "child_id",
  newobj = "analysis_df",
  datasources = conns
)
#> $is.object.created
#> [1] "A data object has been created in all specified data sources"
#>
#> $validity.check
#> [1] "<analysis_df> appears valid in all sources"
```

```r
ds.colnames("analysis_df", datasources = conns)
#> $armadillo
#>   [1] "child_id" "row_id" "mother_id" "cohort_id" "preg_no"
#>   [6] "child_no" "coh_country" "recruit_age" "cob_m" "ethn1_m"
#>  [11] "ethn2_m" "ethn3_m" "agebirth_m_y" "agebirth_m_d" "death_m"
#>  [16] "death_m_age" "prepreg_weight" "prepreg_weight_mes" "prepreg_weight_ga" "latepreg_weight"
#>  [21] "latepreg_weight_mes" "latepreg_weight_ga" "preg_gain" "preg_gain_mes" "height_m"
#>  [26] "height_mes_m" "prepreg_dia" "preg_dia" "preg_thyroid" "preg_fever"
#>  [31] "preeclam" "preg_ht" "asthma_m" "prepreg_psych" "preg_psych"
#>  [36] "ppd" "prepreg_smk" "prepreg_cig" "preg_smk" "preg_cig"
#>  [41] "prepreg_alc" "prepreg_alc_unit" "preg_alc" "preg_alc_unit" "folic_prepreg"
#>  [46] "folic_preg12" "folic_post12" "parity_m" "preg_plan" "mar"
#>  [51] "ivf" "outcome" "mode_delivery" "plac_abrup" "cob_p"
#>  [56] "cob_p_fath" "ethn1_p" "ethn2_p" "ethn3_p" "ethn_p_fath"
#>  [61] "agebirth_p_y" "agebirth_p_d" "agebirth_p_fath" "death_p" "death_p_age"
#>  [66] "death_p_fath" "weight_f1" "weight_mes_f1" "weight_f1_fath" "height_f1"
#>  [71] "height_mes_f1" "height_f1_fath" "dia_bf" "asthma_bf" "psych_bf"
#>  [76] "smk_p" "smk_cig_p" "smk_fath" "birth_month" "birth_year"
#>  [81] "apgar" "neo_unit" "sex" "plurality" "ga_lmp"
#>  [86] "ga_us" "ga_mr" "ga_bj" "birth_weight" "birth_length"
#>  [91] "birth_head_circum" "weight_who_ga" "plac_weight" "con_anomalies" "major_con_anomalies"
#>  [96] "cer_palsy" "sibling_pos" "death_child" "death_child_age" "breastfed_excl"
#> [101] "breastfed_any" "breastfed_ever" "solid_food" "childcare_intro" "cats_preg"
#> [106] "dogs_preg" "cats_quant_preg" "dogs_quant_preg" "pets_.0" "cats_.0"
#> [111] "dogs_.0" "pets_.1" "cats_.1" "dogs_.1" "pets_.2"
#> [116] "cats_.2" "dogs_.2"
```

### Save your work

When you have finished building your analysis frame, you can save it using [workspaces](workspaces.html).

### Performing analysis

There is a variety of analyses you can perform in DataSHIELD, from basic methods such as summary statistics to more advanced methods such as GLM.

#### Simple statistical methods

You can execute a summary of a variable within your analysis frame; it returns summary statistics.

```r
ds.summary("analysis_df$pets_.1", datasources = conns)
#> $armadillo
#> $armadillo$class
#> [1] "numeric"
#>
#> $armadillo$length
#> [1] 1000
#>
#> $armadillo$`quantiles & mean`
#>      5%     10%     25%     50%     75%     90%     95%    Mean
#>   8.000  15.000  32.750  61.000  90.000 108.000 113.000  60.954
```

#### Advanced statistical methods

When your analysis data frame is finished, you can perform the actual analysis. You can use a wide variety of functions; the example below shows a generalised linear model fitted with `ds.glm`.

```r
datashield.assign.table(
  conns = conns,
  table = "gecko/1_1-outcome-1_0/nonrep",
  symbol = "outcome_nonrep"
)

armadillo_glm <- ds.glm(
  formula = "asthma_ever_CHICOS~pets_preg",
  data = "outcome_nonrep",
  family = "binomial",
  datasources = conns
)
```
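`ds.glm` returns a list whose `coefficients` element is the pooled coefficient table; you can inspect it before the meta-analysis:

```r
# inspect the coefficient table (rows are model terms; the 'Estimate' and
# 'Std. Error' columns are used for the meta-analysis below)
armadillo_glm$coefficients
```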
Then do the meta-analysis, first installing the prerequisites.

```r
if (!require("metafor")) install.packages("metafor")
```

```r
yi <- c(armadillo_glm$coefficients["pets_preg", "Estimate"])
sei <- c(armadillo_glm$coefficients["pets_preg", "Std. Error"])

res <- metafor::rma(yi, sei = sei)
res
#>
#> Random-Effects Model (k = 1; tau^2 estimator: REML)
#>
#> tau^2 (estimated amount of total heterogeneity): 0
#> tau (square root of estimated tau^2 value):      0
#> I^2 (total heterogeneity / total variability):   0.00%
#> H^2 (total variability / sampling variability):  1.00
#>
#> Test for Heterogeneity:
#> Q(df = 0) = 0.0000, p-val = 1.0000
#>
#> Model Results:
#>
#> estimate      se     zval    pval    ci.lb   ci.ub
#>  -0.1310  0.1267  -1.0343  0.3010  -0.3793  0.1173
#>
#> ---
#> Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

metafor::forest(res, xlab = "OR", transf = exp, refline = 1, slab = c("armadillo_glm"))
```
*Figure: plot of chunk meta-analysis (forest plot of the pooled estimate).*
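With a single server in the login frame the meta-analysis above has `k = 1`. With multiple cohorts appended, you would fit one model per study (for example by passing a single connection via `datasources`) and pool the estimates. A sketch with hypothetical result objects `glm_cohort1` and `glm_cohort2`:

```r
# pool one estimate per cohort; glm_cohort1 and glm_cohort2 are
# hypothetical ds.glm results, one per server in the login frame
yi <- c(
  glm_cohort1$coefficients["pets_preg", "Estimate"],
  glm_cohort2$coefficients["pets_preg", "Estimate"]
)
sei <- c(
  glm_cohort1$coefficients["pets_preg", "Std. Error"],
  glm_cohort2$coefficients["pets_preg", "Std. Error"]
)
metafor::rma(yi, sei = sei)
```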
#### Creating figures

You can create figures directly with the DataSHIELD methods. For example, a histogram:

```r
# create a histogram
ds.histogram(x = "core_nonrep$coh_country", datasources = conns)
```
*Figure: plot of chunk create-a-histogram.*
```
#> $breaks
#>  [1]  35.31138 116.38319 197.45500 278.52680 359.59861 440.67042 521.74222 602.81403 683.88584 764.95764 846.02945
#>
#> $counts
#>  [1] 106 101  92 103 106 104 105 101 113  69
#>
#> $density
#>  [1] 0.0013074829 0.0012458092 0.0011347965 0.0012704787 0.0013074829 0.0012828134 0.0012951481 0.0012458092 0.0013938261
#> [10] 0.0008510974
#>
#> $mids
#>  [1]  75.84729 156.91909 237.99090 319.06271 400.13451 481.20632 562.27813 643.34993 724.42174 805.49355
#>
#> $xname
#> [1] "xvect"
#>
#> $equidist
#> [1] TRUE
#>
#> attr(,"class")
#> [1] "histogram"
```

```r
# create a heatmap
ds.heatmapPlot(x = "analysis_df$pets_.1", y = "analysis_df$dogs_.1", datasources = conns)
```
*Figure: plot of chunk create-a-heatmap.*
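Other plotting helpers in `dsBaseClient` follow the same pattern. For example, a minimal sketch of a non-disclosive scatter plot of the same two variables:

```r
# create a scatter plot; the plotted points are anonymised server-side
ds.scatterPlot(x = "analysis_df$pets_.1", y = "analysis_df$dogs_.1", datasources = conns)
```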
When you are done, log out to close the server-side sessions.

```r
# logout
datashield.logout(conns)
```
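If you want to resume your work later, DSI's `datashield.logout` can also save the server-side session as a [workspace](workspaces.html) on the way out; `"my-analysis"` here is a hypothetical workspace name:

```r
# save the session as a workspace while logging out
datashield.logout(conns, save = "my-analysis")
```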