% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/AuxSurvey.R
\name{auxsurvey}
\alias{auxsurvey}
\title{Auxiliary Variables in Survey Analysis}
\usage{
auxsurvey(
  formula,
  auxiliary = NULL,
  samples,
  population = NULL,
  subset = NULL,
  family = gaussian(),
  method = c("sample_mean", "rake", "postStratify", "MRP", "GAMP", "linear", "BART"),
  weights = NULL,
  levels = c(0.95, 0.8, 0.5),
  stan_verbose = TRUE,
  nskip = 1000,
  npost = 1000,
  nchain = 4,
  HPD_interval = FALSE,
  seed = NULL
)
}
\arguments{
\item{formula}{A string or formula specifying the outcome model. For non-model-based methods
(e.g., sample mean, raking, post-stratification), only include the outcome variable (e.g., "~Y").
For model-based methods (e.g., MRP, GAMP, linear regression), additional fixed effect predictors can
be specified, such as "Y ~ X1 + X2 + I(X1^2)". For GAMP, smooth functions can be specified as
"Y ~ X1 + s(X2, 10) + s(X3, by = X1)". Categorical variables are automatically treated as dummy variables
in model-based methods.}

\item{auxiliary}{A string specifying the formula for the auxiliary variables. For sample mean and
BART, this should be `NULL`. For raking, post-stratification, and GAMP, this should be an additive model
(e.g., "Z1 + Z2 + Z3"). For MRP, specify random effects for terms in this parameter, such as "Z1 + Z2 + Z3"
or "Z1 + Z2:Z3".}

\item{samples}{A dataframe or tibble containing all variables specified in `formula` and `auxiliary`.
This is typically a subset of the population.}

\item{population}{A dataframe or tibble containing all variables specified in `formula` and `auxiliary`.
This is the entire population used for estimation.}

\item{subset}{A character vector representing filtering conditions to select subsets of `samples` and `population`.
Default is `NULL`, in which case the analysis is performed on the entire dataset. If subsets are specified,
estimates for both the whole data and the subsets will be calculated.}

\item{family}{The distribution family of the outcome variable. Supported options are:
\code{\link[stats]{gaussian}} for continuous outcomes and \code{\link[stats]{binomial}} for binary outcomes.}

\item{method}{A string specifying the model to use. Options include "sample_mean", "rake", "postStratify",
"MRP", "GAMP", "linear", and "BART".}

\item{weights}{A numeric vector of case weights. The length should match the number of cases in `samples`.}

\item{levels}{A numeric vector specifying the confidence levels for the confidence intervals (CIs).
Multiple values can be specified to calculate multiple CIs.}

\item{stan_verbose}{A logical scalar; if `TRUE`, prints all messages when running Stan models. Default is `TRUE`.
This parameter only applies to Bayesian models.}

\item{nskip}{An integer specifying the number of burn-in iterations for each chain in MCMC for Stan models.
Default is `1000`. This parameter only applies to Bayesian models.}

\item{npost}{An integer specifying the number of posterior sampling iterations for each chain in MCMC for Stan models.
Default is `1000`. This parameter only applies to Bayesian models.}

\item{nchain}{An integer specifying the number of MCMC chains for Stan models. Default is `4`. This parameter
only applies to Bayesian models.}

\item{HPD_interval}{A logical scalar; if `TRUE`, calculates the highest posterior density (HPD) intervals for the
CIs of Stan models. Default is `FALSE`, in which case symmetric intervals are calculated. This parameter only applies
to Bayesian models.}

\item{seed}{An integer specifying the random seed for reproducibility. Default is `NULL`.}
}
\value{
A list containing the sample mean estimates and CIs for the subset and/or the whole dataset.
Each element in the list includes:
\itemize{
  \item \code{estimate}: The point estimate of the sample mean.
  \item \code{CI}: Confidence intervals for the sample mean.
  \item Other elements for each confidence level specified in \code{levels}.
}
}
\description{
This function provides a user-friendly interface for various estimators in survey analysis
when working with discretized auxiliary variables. Probability surveys often use continuous
data from administrative records as auxiliary variables, but the utility of this data is
diminished when discretized for confidentiality purposes. This package offers different estimators
that handle discretized auxiliary variables effectively.
}
\details{
The available estimators include:
\itemize{
  \item Weighted or unweighted sample mean
  \item Weighted or unweighted raking
  \item Weighted or unweighted post-stratification
  \item Bayesian methods:
  \itemize{
    \item BART (Bayesian Additive Regression Trees)
    \item MRP (Multilevel Regression with Poststratification)
    \item GAMP (Generalized Additive Model of Response Propensity)
    \item Weighted linear regression
  }
}

These Bayesian models are implemented using the \pkg{rstan} and \pkg{rstanarm} packages.
}
\examples{
## Draw a simulated population and sample; setting 3 has a nonlinear association.
sim <- simulate(N = 3000, discretize = 10, setting = 3, seed = 123)
population <- sim$population
samples <- sim$samples
## Inverse-probability weights: reciprocal of each sampled unit's `true_pi`.
ipw <- 1 / samples$true_pi
## Mean of Y1 over the simulated population, kept for reference.
true_mean <- mean(population$Y1)

## Inverse-probability-weighted sample mean, with one subset
IPW_sample_mean <- auxsurvey(
  "~Y1",
  auxiliary = NULL,
  weights = ipw,
  samples = samples,
  population = population,
  subset = c("Z1 == 1 & Z2 == 1"),
  method = "sample_mean",
  levels = 0.95
)

## Raking on the auxiliary variables, with two subsets
rake <- auxsurvey(
  "~Y1",
  auxiliary = "Z1 + Z2 + Z3 + auX_10",
  samples = samples,
  population = population,
  subset = c("Z1 == 1", "Z1 == 1 & Z2 == 1"),
  method = "rake",
  levels = 0.95
)

## MRP with a single short chain
MRP <- auxsurvey(
  "Y1 ~ 1 + Z1",
  auxiliary = "Z2 + Z3:auX_10",
  samples = samples,
  population = population,
  subset = c("Z1 == 1", "Z1 == 1 & Z2 == 1"),
  method = "MRP",
  levels = 0.95,
  nskip = 40,
  npost = 40,
  nchain = 1,
  stan_verbose = FALSE,
  HPD_interval = TRUE,
  seed = 123
)

## GAMP with smooth terms passed through `auxiliary`
GAMP <- auxsurvey(
  "Y1 ~ 1 + Z1 + Z2 + Z3",
  auxiliary = "s(auX_10) + s(logit_true_pi, by = Z1)",
  samples = samples,
  population = population,
  method = "GAMP",
  levels = 0.95,
  nskip = 40,
  npost = 40,
  nchain = 1,
  stan_verbose = FALSE,
  HPD_interval = TRUE,
  seed = 123
)

## BART; all predictors go in the formula and `auxiliary` stays NULL
BART <- auxsurvey(
  "Y1 ~ Z1 + Z2 + Z3 + auX_10",
  auxiliary = NULL,
  samples = samples,
  population = population,
  method = "BART",
  levels = 0.95,
  nskip = 40,
  npost = 40,
  nchain = 1,
  HPD_interval = TRUE,
  seed = 123
)


}
