## ---- include = FALSE---------------------------------------------------------
knitr::opts_chunk$set(
  collapse = TRUE,
  comment = "#>"
)

## ----eval=FALSE---------------------------------------------------------------
#  install.packages("OTrecod")

## ----results='hide',message=FALSE,warning=FALSE-------------------------------
library(OTrecod)

## ----eval=FALSE---------------------------------------------------------------
#  # Install development version from GitHub
#  devtools::install_github("otrecoding/OTrecod")

## ----results='hide',message=FALSE,warning=FALSE-------------------------------
library(StatMatch)
data(samp.A)

## ----eval=TRUE----------------------------------------------------------------
dim(samp.A)
head(samp.A)
table(samp.A$c.neti)   # Repartition of c.neti in the sample

## ----eval=TRUE----------------------------------------------------------------
c.neti            = as.numeric(samp.A$c.neti)
samp.A$c.neti.bis = as.factor(ifelse(c.neti %in% c(1,2),1, 
                              ifelse(c.neti %in% c(3,4),2, 
                              ifelse(c.neti %in% c(5,6),3,4)))) 
data1 = samp.A[1:200,c(2:3,5,7:9,12:13)]
colnames(data1)[4] = "age" 
head(data1)
data2 = samp.A[201:350,c(3,5:6,8:11,13:14)]
head(data2)

## ----eval=TRUE----------------------------------------------------------------
table(data1$c.neti)        # 7 levels in data1

table(data2$c.neti.bis)    # 4 levels in data2

colnames(data1)

colnames(data2)

intersect(colnames(data1), colnames(data2))   # the susbet of a priori shared variables

## ----eval=TRUE----------------------------------------------------------------
db_test  = merge_dbs(data1, data2, NAME_Y = "c.neti", NAME_Z = "c.neti.bis",
                     ordinal_DB1 = c(2,3,4,7), ordinal_DB2 = c(1:2,6,9))

summary(db_test)

db_test$REMAINING_VAR

db_test$REMOVE1

db_test$REMOVE2

db_test$ID1_drop; db_test$ID2_drop

db_test$DB_READY[c(1:5,201:205),]   # The 5 1st subjects of the two databases 

## ---- eval=TRUE---------------------------------------------------------------
# for data1
test_DB1 = select_pred(db_test$DB_READY,Y = "Y", Z = "Z", ID = 1, OUT = "Y",
                       quanti = 8, nominal = c(1,5:6,7), ordinal = c(2:4),
                       convert_num = 8, convert_class = 4,
                       thresh_cat = 0.30, thresh_num = 0.70, thresh_Y = 0.10,
                       RF = TRUE, RF_SEED = 3017)

# for data2
test_DB2 = select_pred(db_test$DB_READY,Y = "Y", Z = "Z", ID = 1, OUT = "Z",
                       quanti = 8, nominal = c(1,5:6,7), ordinal = c(2:4),
                       convert_num = 8, convert_class = 4,
                       thresh_cat = 0.30, thresh_num = 0.70, thresh_Y = 0.10,
                       RF = TRUE, RF_SEED = 3017)

## ----eval=TRUE----------------------------------------------------------------
summary(test_DB1)

test_DB1$vcrm_OUTC_cat

test_DB1$collinear_PB

# Results from RF
test_DB1$drop_var

test_DB1$RF_PRED

## -----------------------------------------------------------------------------
summary(test_DB2)

test_DB2$vcrm_OUTC_cat

test_DB2$collinear_PB

# Results from RF
test_DB2$drop_var

test_DB2$RF_PRED

## -----------------------------------------------------------------------------
match_var = db_test$DB_READY[,-c(5,8)]
match_var[c(1:5,201:205),]

## -----------------------------------------------------------------------------
# sequential algorithm
mod1_seq = OT_outcome(match_var, nominal = c(1,5:6), ordinal = 2:4, dist.choice = "E",
                      maxrelax = 0 , indiv.method = "sequential", which.DB = "A")
summary(mod1_seq)

# optimal algorithm with no relaxation term
mod2_opt = OT_outcome(match_var, nominal = c(1,5:6), ordinal = 2:4, dist.choice = "E",
                      maxrelax = 0, indiv.method = "optimal", which.DB = "A")
head(mod2_opt$profile)

dim(mod2_opt$profile)

mod2_opt$gamma_A

head(mod2_opt$DATA1_OT)


## ----joint--------------------------------------------------------------------
# Algorithms with no enrichments
mod3_joint = OT_joint(match_var, nominal = c(1,5), ordinal = c(2:4,6), dist.choice = "E", 
                      prox.X = 0.10, which.DB = "A")
summary(mod3_joint)

## -----------------------------------------------------------------------------
# Validation of the mod1_seq model
verif_out1 = verif_OT(mod1_seq, stab.prob = TRUE, min.neigb = 3)
verif_out1$conf.mat

verif_out1$res.prox

verif_out1$res.stab


# Validation of the mod2_seq model
verif_out2 = verif_OT(mod2_opt, stab.prob = TRUE, min.neigb = 3)
verif_out2$conf.mat
rate_good_pred = (37+40+31+45+18+13+9)/200
rate_good_pred

verif_out2$res.prox

verif_out2$res.stab


# Validation of the mod3_opt model
verif_jt   = verif_OT(mod3_joint, stab.prob = TRUE, min.neigb = 3)
verif_jt$conf.mat

verif_jt$res.prox

verif_jt$res.stab