% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/onehot2sql.R
\name{onehot2sql}
\alias{onehot2sql}
\title{Prepare training data in R so that it is ready for XGBoost model fitting.
Meta information is stored so the exact transformation can be applied to any new data.
It also outputs a SQL query performing the exact one-hot encoding for in-database data preparation.}
\usage{
onehot2sql(data, meta = NULL, sep = "_", ws_replace = TRUE,
  ws_replace_with = "", unique_id = NULL, output_file_name = NULL,
  input_table_name = NULL)
}
\arguments{
\item{data}{Data object of class \code{data.frame} or \code{data.table}.}

\item{meta}{Optional, a list that keeps track of all the transformations that have been applied to the categorical features.}

\item{sep}{Separation symbol placed between the categorical features and their levels to form the column names inside the output \code{model.matrix}, defaults to "_".}

\item{ws_replace}{Boolean indicator controlling whether white-space and punctuation inside categorical feature levels should be replaced, defaults to TRUE.}

\item{ws_replace_with}{Replacement symbol, defaults to '' which means all white-space and punctuation will be removed.}

\item{unique_id}{A unique row identifier is crucial for in-database scoring of an XGBoost model. If not given, the SQL query will be generated with the id name "ROW_KEY".}

\item{output_file_name}{Optional, name of the file that the SQL query will be written to.}

\item{input_table_name}{Name of the raw data table in the database that the SQL query will select from. If not given, the SQL query will be generated with the table name "INPUT_TABLE".}
}
\value{
A list of 1) \code{meta} data tracking the transformation;
                  2) matrix \code{model.matrix}, the data after processing, which is ready for XGBoost fitting;
                  3) SQL query \code{sql} performing the exact one-hot encoding in the database.
}
\description{
This function performs full one-hot encoding for all the categorical features inside the training data,
with all NAs inside both categorical and numeric features preserved.
Besides outputting a matrix \code{model.matrix}, which is the data after processing,
it also outputs \code{meta} information keeping track of all the transformations the function performs,
while the SQL query for the transformation is kept in the output \code{sql} and written to the file specified by \code{output_file_name}.
If \code{meta} is specified as input to the function, the transformation and the corresponding SQL query will
follow what is kept in \code{meta} exactly.
}
\examples{
library(data.table)

### Build three variants of the diamonds data to serve as training input
# df: plain data.frame copy
df <- data.frame(ggplot2::diamonds)
head(df)

# d1: data.frame copy with two missing values injected
d1 <- data.frame(ggplot2::diamonds)
d1[1, 2] <- NA  # row 1, column 2 (cut)
d1[2, 5] <- NA  # row 2, column 5 (depth)
head(d1)

# d2: data.table copy where cut is an unordered factor, clarity is character,
# and an extra IDate column tsdt is added (first three rows one day earlier)
d2 <- data.table(ggplot2::diamonds)
d2[, cut := factor(cut, ordered = FALSE)]
d2[, clarity := as.character(clarity)]
d2[, tsdt := as.IDate('2017-01-05')]
d2[1:3, tsdt := tsdt - 1]
head(d2)

### Encode each training set; the returned meta is reused further below
out <- onehot2sql(df)
out1 <- onehot2sql(d1)  # NA is kept in the output
out2 <- onehot2sql(d2)  # all non-numeric features will be treated as categorical

### Re-apply the stored transformation to new data by passing meta
# test-1: a column of the new data changed class (cut is now character)
newdata <- df[1:5, ]
newdata$cut <- as.character(newdata$cut)
onehot2sql(newdata, meta = out$meta)$model.matrix

# test-2: the new data contains NA values
newdata <- df[1:5, ]
newdata[1, 1] <- NA
newdata[2, 1] <- NA
newdata[3, 2] <- NA
newdata[3, 3] <- NA
newdata[5, 4] <- NA
onehot2sql(newdata, meta = out$meta)$model.matrix

# test-3: the new data contains values not present in the training data
newdata <- d2[1:5, ]
newdata[5, clarity := 'NEW']
newdata[1, tsdt := as.IDate('2017-05-01')]
onehot2sql(newdata, meta = out2$meta)$model.matrix

# test-4: the new data has an additional column
newdata <- d2[1:5, ]
newdata[, new_col := 1]
onehot2sql(newdata, meta = out2$meta)$model.matrix

# test-5: the new data is missing a column that was present in training
newdata <- d2[1:5, ]
newdata[, cut := NULL]
onehot2sql(newdata, meta = out2$meta)$model.matrix
}
