% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/analyze_variables.R
\name{analyze_variables}
\alias{analyze_variables}
\alias{analyze_vars}
\alias{s_summary}
\alias{s_summary.numeric}
\alias{s_summary.factor}
\alias{s_summary.character}
\alias{s_summary.logical}
\alias{a_summary}
\title{Analyze variables}
\usage{
analyze_vars(
  lyt,
  vars,
  var_labels = vars,
  na_str = default_na_str(),
  nested = TRUE,
  ...,
  na.rm = TRUE,
  show_labels = "default",
  table_names = vars,
  section_div = NA_character_,
  .stats = c("n", "mean_sd", "median", "range", "count_fraction"),
  .formats = NULL,
  .labels = NULL,
  .indent_mods = NULL
)

s_summary(x, na.rm = TRUE, denom, .N_row, .N_col, .var, ...)

\method{s_summary}{numeric}(
  x,
  na.rm = TRUE,
  denom,
  .N_row,
  .N_col,
  .var,
  control = control_analyze_vars(),
  ...
)

\method{s_summary}{factor}(
  x,
  na.rm = TRUE,
  denom = c("n", "N_row", "N_col"),
  .N_row,
  .N_col,
  ...
)

\method{s_summary}{character}(
  x,
  na.rm = TRUE,
  denom = c("n", "N_row", "N_col"),
  .N_row,
  .N_col,
  .var,
  verbose = TRUE,
  ...
)

\method{s_summary}{logical}(
  x,
  na.rm = TRUE,
  denom = c("n", "N_row", "N_col"),
  .N_row,
  .N_col,
  ...
)

a_summary(
  x,
  .N_col,
  .N_row,
  .var = NULL,
  .df_row = NULL,
  .ref_group = NULL,
  .in_ref_col = FALSE,
  compare = FALSE,
  .stats = NULL,
  .formats = NULL,
  .labels = NULL,
  .indent_mods = NULL,
  na.rm = TRUE,
  na_str = default_na_str(),
  ...
)
}
\arguments{
\item{lyt}{(\code{PreDataTableLayouts})\cr layout that analyses will be added to.}

\item{vars}{(\code{character})\cr variable names for the primary analysis variable to be iterated over.}

\item{var_labels}{(\code{character})\cr variable labels.}

\item{na_str}{(\code{string})\cr string used to replace all \code{NA} or empty values in the output.}

\item{nested}{(\code{flag})\cr whether this layout instruction should be applied within the existing layout structure \emph{if
possible} (\code{TRUE}, the default) or as a new top-level element (\code{FALSE}). Ignored if it would nest a split
underneath analyses, which is not allowed.}

\item{...}{arguments passed to \code{s_summary()}.}

\item{na.rm}{(\code{flag})\cr whether \code{NA} values should be removed from \code{x} prior to analysis.}

\item{show_labels}{(\code{string})\cr label visibility: one of "default", "visible" and "hidden".}

\item{table_names}{(\code{character})\cr this can be customized in the case that the same \code{vars} are analyzed multiple
times, to avoid warnings from \code{rtables}.}

\item{section_div}{(\code{string})\cr string which should be repeated as a section divider after each group
defined by this split instruction, or \code{NA_character_} (the default) for no section divider.}

\item{.stats}{(\code{character})\cr statistics to select for the table. Run \code{get_stats("analyze_vars_numeric")} to see
statistics available for numeric variables, and \code{get_stats("analyze_vars_counts")} for statistics available
for non-numeric variables.}

\item{.formats}{(named \code{character} or \code{list})\cr formats for the statistics. See Details in \code{analyze_vars} for more
information on the \code{"auto"} setting.}

\item{.labels}{(named \code{character})\cr labels for the statistics (without indent).}

\item{.indent_mods}{(named \code{integer})\cr indent modifiers for the labels. Each element of the vector
should be a name-value pair with name corresponding to a statistic specified in \code{.stats} and value the indentation
for that statistic's row label.}

\item{x}{(\code{numeric})\cr vector of numbers we want to analyze.}

\item{denom}{(\code{string})\cr choice of denominator for proportion. Options are:
\itemize{
\item \code{n}: number of values in this row and column intersection.
\item \code{N_row}: total number of values in this row across columns.
\item \code{N_col}: total number of values in this column across rows.
}}

\item{.N_row}{(\code{integer(1)})\cr row-wise N (row group count) for the group of observations being analyzed
(i.e. with no column-based subsetting) that is typically passed by \code{rtables}.}

\item{.N_col}{(\code{integer(1)})\cr column-wise N (column count) for the full column being analyzed that is typically
passed by \code{rtables}.}

\item{.var}{(\code{string})\cr single variable name that is passed by \code{rtables} when requested
by a statistics function.}

\item{control}{(\code{list})\cr parameters for descriptive statistics details, specified by using
the helper function \code{\link[=control_analyze_vars]{control_analyze_vars()}}. Some possible parameter options are:
\itemize{
\item \code{conf_level} (\code{proportion})\cr confidence level of the interval for mean and median.
\item \code{quantiles} (\code{numeric(2)})\cr vector of length two to specify the quantiles.
\item \code{quantile_type} (\code{numeric(1)})\cr between 1 and 9 selecting quantile algorithms to be used.
See more about \code{type} in \code{\link[stats:quantile]{stats::quantile()}}.
\item \code{test_mean} (\code{numeric(1)})\cr value to test against the mean under the null hypothesis when calculating p-value.
}}

\item{verbose}{(\code{flag})\cr defaults to \code{TRUE}, which prints out warnings and messages. It is mainly used
to print out information about factor casting.}

\item{.df_row}{(\code{data.frame})\cr data frame across all of the columns for the given row split.}

\item{.ref_group}{(\code{data.frame} or \code{vector})\cr the data corresponding to the reference group.}

\item{.in_ref_col}{(\code{flag})\cr \code{TRUE} when working with the reference level, \code{FALSE} otherwise.}

\item{compare}{(\code{flag})\cr whether comparison statistics should be analyzed instead of summary statistics
(\code{compare = TRUE} adds \code{pval} statistic comparing against reference group).}
}
\value{
\itemize{
\item \code{analyze_vars()} returns a layout object suitable for passing to further layouting functions,
or to \code{\link[rtables:build_table]{rtables::build_table()}}. Adding this function to an \code{rtable} layout will add formatted rows containing
the statistics from \code{s_summary()} to the table layout.
}

\itemize{
\item \code{s_summary()} returns different statistics depending on the class of \code{x}.
}

\itemize{
\item If \code{x} is of class \code{numeric}, returns a \code{list} with the following named \code{numeric} items:
\itemize{
\item \code{n}: The \code{\link[=length]{length()}} of \code{x}.
\item \code{sum}: The \code{\link[=sum]{sum()}} of \code{x}.
\item \code{mean}: The \code{\link[=mean]{mean()}} of \code{x}.
\item \code{sd}: The \code{\link[stats:sd]{stats::sd()}} of \code{x}.
\item \code{se}: The standard error of \code{x} mean, i.e.: (\code{sd(x) / sqrt(length(x))}).
\item \code{mean_sd}: The \code{\link[=mean]{mean()}} and \code{\link[stats:sd]{stats::sd()}} of \code{x}.
\item \code{mean_se}: The \code{\link[=mean]{mean()}} of \code{x} and its standard error (see above).
\item \code{mean_ci}: The CI for the mean of \code{x} (from \code{\link[=stat_mean_ci]{stat_mean_ci()}}).
\item \code{mean_sei}: The SE interval for the mean of \code{x}, i.e.: (\code{\link[=mean]{mean()}} -/+ \code{\link[stats:sd]{stats::sd()}} / \code{\link[=sqrt]{sqrt()}}).
\item \code{mean_sdi}: The SD interval for the mean of \code{x}, i.e.: (\code{\link[=mean]{mean()}} -/+ \code{\link[stats:sd]{stats::sd()}}).
\item \code{mean_pval}: The two-sided p-value of the mean of \code{x} (from \code{\link[=stat_mean_pval]{stat_mean_pval()}}).
\item \code{median}: The \code{\link[stats:median]{stats::median()}} of \code{x}.
\item \code{mad}: The median absolute deviation of \code{x}, i.e.: (\code{\link[stats:median]{stats::median()}} of \code{xc},
where \code{xc} = \code{x} - \code{\link[stats:median]{stats::median()}}).
\item \code{median_ci}: The CI for the median of \code{x} (from \code{\link[=stat_median_ci]{stat_median_ci()}}).
\item \code{quantiles}: Two sample quantiles of \code{x} (from \code{\link[stats:quantile]{stats::quantile()}}).
\item \code{iqr}: The \code{\link[stats:IQR]{stats::IQR()}} of \code{x}.
\item \code{range}: The \code{\link[=range_noinf]{range_noinf()}} of \code{x}.
\item \code{min}: The \code{\link[=min]{min()}} of \code{x}.
\item \code{max}: The \code{\link[=max]{max()}} of \code{x}.
\item \code{median_range}: The \code{\link[=median]{median()}} and \code{\link[=range_noinf]{range_noinf()}} of \code{x}.
\item \code{cv}: The coefficient of variation of \code{x}, i.e.: (\code{\link[stats:sd]{stats::sd()}} / \code{\link[=mean]{mean()}} * 100).
\item \code{geom_mean}: The geometric mean of \code{x}, i.e.: (\code{exp(mean(log(x)))}).
\item \code{geom_cv}: The geometric coefficient of variation of \code{x}, i.e.: (\code{sqrt(exp(sd(log(x)) ^ 2) - 1) * 100}).
}
}

\itemize{
\item If \code{x} is of class \code{factor} or converted from \code{character}, returns a \code{list} with named \code{numeric} items:
\itemize{
\item \code{n}: The \code{\link[=length]{length()}} of \code{x}.
\item \code{count}: A list with the number of cases for each level of the factor \code{x}.
\item \code{count_fraction}: Similar to \code{count} but also includes the proportion of cases for each level of the
factor \code{x} relative to the denominator, or \code{NA} if the denominator is zero.
}
}

\itemize{
\item If \code{x} is of class \code{logical}, returns a \code{list} with named \code{numeric} items:
\itemize{
\item \code{n}: The \code{\link[=length]{length()}} of \code{x} (possibly after removing \code{NA}s).
\item \code{count}: Count of \code{TRUE} in \code{x}.
\item \code{count_fraction}: Count and proportion of \code{TRUE} in \code{x} relative to the denominator, or \code{NA} if the
denominator is zero. Note that \code{NA}s in \code{x} are never counted and never lead to \code{NA} here.
}
}

\itemize{
\item \code{a_summary()} returns the corresponding list with formatted \code{\link[rtables:CellValue]{rtables::CellValue()}}.
}
}
\description{
\ifelse{html}{\href{https://lifecycle.r-lib.org/articles/stages.html#stable}{\figure{lifecycle-stable.svg}{options: alt='[Stable]'}}}{\strong{[Stable]}}

The analyze function \code{\link[=analyze_vars]{analyze_vars()}} generates a summary of one or more variables, using the S3 generic function
\code{\link[=s_summary]{s_summary()}} to calculate a list of summary statistics. A list of all available statistics for numeric
variables can be viewed by running \code{get_stats("analyze_vars_numeric")} and for non-numeric variables by running
\code{get_stats("analyze_vars_counts")}. Use the \code{.stats} parameter to specify the statistics to include in your output
summary table.
}
\details{
\strong{Automatic digit formatting:} The number of digits to display can be automatically determined from the analyzed
variable(s) (\code{vars}) for certain statistics by setting the statistic format to \code{"auto"} in \code{.formats}.
This utilizes the \code{\link[=format_auto]{format_auto()}} formatting function. Note that only data for the current row & variable (for all
columns) will be considered (\code{.df_row[[.var]]}, see \code{\link[rtables:additional_fun_params]{rtables::additional_fun_params}}) and not the whole dataset.
}
\section{Functions}{
\itemize{
\item \code{analyze_vars()}: Layout-creating function which can take statistics function arguments
and additional format arguments. This function is a wrapper for \code{\link[rtables:analyze]{rtables::analyze()}}.

\item \code{s_summary()}: S3 generic function to produce a variable summary.

\item \code{s_summary(numeric)}: Method for \code{numeric} class.

\item \code{s_summary(factor)}: Method for \code{factor} class.

\item \code{s_summary(character)}: Method for \code{character} class. This makes an automatic
conversion to factor (with a warning) and then forwards to the method for factors.

\item \code{s_summary(logical)}: Method for \code{logical} class.

\item \code{a_summary()}: Formatted analysis function which is used as \code{afun} in \code{analyze_vars()} and
\code{compare_vars()} and as \code{cfun} in \code{summarize_colvars()}.

}}
\note{
\itemize{
\item If \code{x} is an empty vector, \code{NA} is returned. This is the expected feature so as to return \code{rcell} content in
\code{rtables} when the intersection of a column and a row delimits an empty data selection.
\item When the \code{mean} function is applied to an empty vector, \code{NA} will be returned instead of \code{NaN}, the latter
being standard behavior in R.
}

\itemize{
\item If \code{x} is an empty \code{factor}, a list is still returned for \code{count} with one element
per factor level. If there are no levels in \code{x}, the function fails.
\item If factor variables contain \code{NA}, these \code{NA} values are excluded by default. To include \code{NA} values
set \code{na.rm = FALSE} and missing values will be displayed as an \code{NA} level. Alternatively, an explicit
factor level can be defined for \code{NA} values during pre-processing via \code{\link[=df_explicit_na]{df_explicit_na()}} - the
default \code{na_level} (\code{"<Missing>"}) will also be excluded when \code{na.rm} is set to \code{TRUE}.
}

\itemize{
\item Automatic conversion of character to factor does not guarantee that the table
can be generated correctly. In particular for sparse tables this very likely can fail.
It is therefore better to always pre-process the dataset such that factors are manually
created from character variables before passing the dataset to \code{\link[rtables:build_table]{rtables::build_table()}}.
}

\itemize{
\item To use for comparison (with additional p-value statistic), parameter \code{compare} must be set to \code{TRUE}.
\item Ensure that either all \code{NA} values are converted to an explicit \code{NA} level or all \code{NA} values are left as is.
}
}
\examples{
## Fabricated dataset.
dta_test <- data.frame(
  USUBJID = rep(1:6, each = 3),
  PARAMCD = rep("lab", 6 * 3),
  AVISIT  = rep(paste0("V", 1:3), 6),
  ARM     = rep(LETTERS[1:3], rep(6, 3)),
  AVAL    = c(9:1, rep(NA, 9))
)

# `analyze_vars()` in `rtables` pipelines
## Default output within a `rtables` pipeline.
l <- basic_table() \%>\%
  split_cols_by(var = "ARM") \%>\%
  split_rows_by(var = "AVISIT") \%>\%
  analyze_vars(vars = "AVAL")

build_table(l, df = dta_test)

## Select and format statistics output.
l <- basic_table() \%>\%
  split_cols_by(var = "ARM") \%>\%
  split_rows_by(var = "AVISIT") \%>\%
  analyze_vars(
    vars = "AVAL",
    .stats = c("n", "mean_sd", "quantiles"),
    .formats = c("mean_sd" = "xx.x, xx.x"),
    .labels = c(n = "n", mean_sd = "Mean, SD", quantiles = c("Q1 - Q3"))
  )

build_table(l, df = dta_test)

## Use arguments interpreted by `s_summary`.
l <- basic_table() \%>\%
  split_cols_by(var = "ARM") \%>\%
  split_rows_by(var = "AVISIT") \%>\%
  analyze_vars(vars = "AVAL", na.rm = FALSE)

build_table(l, df = dta_test)

## Handle `NA` levels first when summarizing factors.
dta_test$AVISIT <- NA_character_
dta_test <- df_explicit_na(dta_test)
l <- basic_table() \%>\%
  split_cols_by(var = "ARM") \%>\%
  analyze_vars(vars = "AVISIT", na.rm = FALSE)

build_table(l, df = dta_test)

# auto format
dt <- data.frame("VAR" = c(0.001, 0.2, 0.0011000, 3, 4))
basic_table() \%>\%
  analyze_vars(
    vars = "VAR",
    .stats = c("n", "mean", "mean_sd", "range"),
    .formats = c("mean_sd" = "auto", "range" = "auto")
  ) \%>\%
  build_table(dt)

# `s_summary.numeric`

## Basic usage: empty numeric returns NA-filled items.
s_summary(numeric())

## Management of NA values.
x <- c(NA_real_, 1)
s_summary(x, na.rm = TRUE)
s_summary(x, na.rm = FALSE)

x <- c(NA_real_, 1, 2)
s_summary(x, stats = NULL)

## Benefits in `rtables` constructions:
dta_test <- data.frame(
  Group = rep(LETTERS[1:3], each = 2),
  sub_group = rep(letters[1:2], each = 3),
  x = 1:6
)

## The summary obtained with `rtables`:
basic_table() \%>\%
  split_cols_by(var = "Group") \%>\%
  split_rows_by(var = "sub_group") \%>\%
  analyze(vars = "x", afun = s_summary) \%>\%
  build_table(df = dta_test)

## By comparison with `lapply`:
X <- split(dta_test, f = with(dta_test, interaction(Group, sub_group)))
lapply(X, function(x) s_summary(x$x))

# `s_summary.factor`

## Basic usage:
s_summary(factor(c("a", "a", "b", "c", "a")))

# Empty factor returns zero-filled items.
s_summary(factor(levels = c("a", "b", "c")))

## Management of NA values.
x <- factor(c(NA, "Female"))
x <- explicit_na(x)
s_summary(x, na.rm = TRUE)
s_summary(x, na.rm = FALSE)

## Different denominators.
x <- factor(c("a", "a", "b", "c", "a"))
s_summary(x, denom = "N_row", .N_row = 10L)
s_summary(x, denom = "N_col", .N_col = 20L)

# `s_summary.character`

## Basic usage:
s_summary(c("a", "a", "b", "c", "a"), .var = "x", verbose = FALSE)
s_summary(c("a", "a", "b", "c", "a", ""), .var = "x", na.rm = FALSE, verbose = FALSE)

# `s_summary.logical`

## Basic usage:
s_summary(c(TRUE, FALSE, TRUE, TRUE))

# Empty logical vector returns zero-filled items.
s_summary(as.logical(c()))

## Management of NA values.
x <- c(NA, TRUE, FALSE)
s_summary(x, na.rm = TRUE)
s_summary(x, na.rm = FALSE)

## Different denominators.
x <- c(TRUE, FALSE, TRUE, TRUE)
s_summary(x, denom = "N_row", .N_row = 10L)
s_summary(x, denom = "N_col", .N_col = 20L)

a_summary(factor(c("a", "a", "b", "c", "a")), .N_row = 10, .N_col = 10)
a_summary(
  factor(c("a", "a", "b", "c", "a")),
  .ref_group = factor(c("a", "a", "b", "c")), compare = TRUE
)

a_summary(c("A", "B", "A", "C"), .var = "x", .N_col = 10, .N_row = 10, verbose = FALSE)
a_summary(
  c("A", "B", "A", "C"),
  .ref_group = c("B", "A", "C"), .var = "x", compare = TRUE, verbose = FALSE
)

a_summary(c(TRUE, FALSE, FALSE, TRUE, TRUE), .N_row = 10, .N_col = 10)
a_summary(
  c(TRUE, FALSE, FALSE, TRUE, TRUE),
  .ref_group = c(TRUE, FALSE), .in_ref_col = TRUE, compare = TRUE
)

a_summary(rnorm(10), .N_col = 10, .N_row = 20, .var = "bla")
a_summary(rnorm(10, 5, 1), .ref_group = rnorm(20, -5, 1), .var = "bla", compare = TRUE)

}
