% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/fusedTree.R
\name{CVfoldsTree}
\alias{CVfoldsTree}
\title{Create balanced cross-validation folds for hyperparameter tuning}
\usage{
CVfoldsTree(Y, Tree, Z, model = NULL, kfold = 5, nrepeat = 3)
}
\arguments{
\item{Y}{The response variable. Should be one of:
\itemize{
\item Numeric (for linear regression),
\item Binary (encoded as 0 and 1, for logistic regression),
\item A survival object created using \code{Surv()} (for Cox regression).
}
Only right-censored survival data is currently supported.}

\item{Tree}{A fitted tree object, created using \pkg{rpart} or \pkg{partykit}.
Must be an object of class \code{"rpart"} (from the \pkg{rpart} package) or
\code{"constparty"} (from the \pkg{partykit} package).}

\item{Z}{A \code{data.frame} of clinical variables used to fit the tree.
This is used to determine node membership for balancing folds.}

\item{model}{Character. Specifies the type of outcome model. Must be one of:
\code{"linear"}, \code{"logistic"}, or \code{"cox"}. The default in the usage
is \code{NULL}, so one of these values should be supplied explicitly.}

\item{kfold}{Integer. Number of folds K for cross-validation. Defaults to 5.}

\item{nrepeat}{Integer. Number of times the K-fold cross-validation is repeated.
Defaults to 3.}
}
\value{
A list of length \code{kfold * nrepeat}, where each element contains
the test indices for a specific fold. These indices can be used to
systematically split the data during cross-validation.
}
\description{
Constructs repeated K-fold cross-validation folds, balanced with respect to
the fitted tree structure and outcome (if applicable). The folds contain only
the test sample indices. This function is useful for tuning penalty parameters
in the fusedTree model.
}
\details{
For binary and survival outcomes, the function ensures that the proportion
of cases vs. controls (or events vs. censored observations) remains
relatively constant across folds. In addition, samples are balanced across
the leaf nodes of the fitted tree to ensure consistency in node composition
between folds.
}
\examples{
set.seed(1) # fix the RNG so the random draws in this example are reproducible
p <- 5      # number of omics variables (low for illustration)
p_Clin <- 5 # number of clinical variables
N <- 100    # sample size
# Friedman-like mean function: combines the first five columns of z
# (sine interaction of columns 1-2, quadratic in 3, exponential in 4, linear in 5)
g <- function(z) {
  sine_term <- 15 * sin(pi * z[, 1] * z[, 2])
  quad_term <- 10 * (z[, 3] - 0.5)^2
  exp_term  <- 2 * exp(z[, 4])
  lin_term  <- 2 * z[, 5]
  sine_term + quad_term + exp_term + lin_term
}
Z <- as.data.frame(matrix(runif(N * p_Clin), nrow = N)) # clinical covariates
X <- matrix(rnorm(N * p), nrow = N)            # omics data
betas <- c(1, -1, 3, 4, 2)                     # omics effects
Y <- g(Z) + X \%*\% betas + rnorm(N)             # continuous outcome
Y <- as.vector(Y)                              # drop the matrix shape left by the matrix product
dat <- cbind.data.frame(Y, Z)                  # set up data correctly for rpart
rp <- rpart::rpart(Y ~ ., data = dat,
                   control = rpart::rpart.control(xval = 5, minbucket = 10),
                   model = TRUE)
# prune at the complexity parameter with the lowest cross-validated error
best_row <- which.min(rp$cptable[, "xerror"])
cp <- rp$cptable[best_row, "CP"]
Treefit <- rpart::prune(rp, cp = cp)
plot(Treefit)
folds <- CVfoldsTree(Y = Y, Tree = Treefit, Z = Z, model = "linear")
}
