% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/runSimulation.R
\name{runSimulation}
\alias{runSimulation}
\title{Run a Monte Carlo simulation given a data.frame of conditions and simulation functions}
\usage{
runSimulation(design, replications, generate, analyse, summarise,
  fixed_objects = NULL, parallel = FALSE, packages = NULL,
  ncores = parallel::detectCores(), MPI = FALSE, save = FALSE,
  save_results = FALSE, save_generate_data = FALSE, filename = NULL,
  max_errors = 50, include_errors = TRUE, seed = NULL,
  save_details = list(), edit = "none", verbose = TRUE)
}
\arguments{
\item{design}{a \code{data.frame} object containing the Monte Carlo simulation conditions to
be studied, where each row represents a unique condition}

\item{replications}{number of replications to perform per condition (i.e., each row in \code{design}).
Must be greater than 0}

\item{generate}{user-defined data and parameter generating function.
See \code{\link{generate}} for details}

\item{analyse}{user-defined computation function which acts on the data generated from
\code{\link{generate}}. See \code{\link{analyse}} for details}

\item{summarise}{user-defined summary function to be used after all the replications have completed.
See \code{\link{summarise}} for details}

\item{fixed_objects}{(optional) an object (usually a \code{list})
containing additional user-defined objects
that should remain fixed across conditions. This is useful when including
long fixed vectors of population parameters, data
that should be used across all conditions and replications (e.g., including a fixed design matrix
for linear regression), or simply can be used to control constant global elements such as sample size}

\item{parallel}{logical; use parallel processing from the \code{parallel} package over each
unique condition?}

\item{packages}{a character vector of external packages to be used during the simulation (e.g.,
\code{c('MASS', 'mvtnorm', 'simsem')} ). Use this input when \code{parallel = TRUE} or
\code{MPI = TRUE} to use non-standard functions from additional packages,
otherwise the functions must be made available by using explicit
\code{\link{library}} or \code{\link{require}} calls within the provided simulation functions.
Alternatively, functions can be called explicitly without attaching the package with \code{::}
(e.g., \code{mvtnorm::rmvnorm()})}

\item{ncores}{number of cores to be used in parallel execution. Default uses all available}

\item{MPI}{logical; use the \code{foreach} package in a form usable by MPI to run simulation
in parallel on a cluster? Default is \code{FALSE}}

\item{save}{logical; save the simulation state to the hard-drive? This is useful
for simulations which require an extended amount of time. When \code{TRUE}, a temp file
will be created in the working directory which allows the simulation state to be saved
and recovered (in case of power outages, crashes, etc). To recover your simulation at the last known
location simply rerun the same code you used to initially define the simulation and the object
will automatically be detected and read-in. Default is \code{FALSE}}

\item{save_results}{logical; save the results returned from \code{\link{analyse}} to external
\code{.rds} files located in the defined \code{save_results_dirname} directory/folder?
Use this if you would like to keep track of the individual parameters returned from the analyses.
Each saved object will contain a list of three elements containing the condition (row from \code{design}),
results (as a \code{list} or \code{matrix}), and try-errors. When \code{TRUE}, a temp file will be used to track the simulation
state (in case of power outages, crashes, etc). Default is \code{FALSE}}

\item{save_generate_data}{logical; save the data returned from \code{\link{generate}} to external \code{.rds} files
located in the defined \code{save_generate_data_dirname} directory/folder?
It is generally recommended to leave this argument as \code{FALSE} because saving datasets will often consume
a large amount of disk space, and by and large saving data is not required or recommended for simulations.
Default is \code{FALSE}}

\item{filename}{the name of the \code{.rds} file to save the final simulation results to.
When \code{NULL} the final simulation object is not saved to the drive. As well,
if the same file name already exists in the working directory at the time of saving then a new
file will be generated instead and a warning will be thrown. This helps avoid accidentally overwriting
existing files. Default is \code{NULL}}

\item{max_errors}{the simulation will terminate when more than this number of errors are thrown in any
given condition. The purpose of this is to indicate that likely something problematic is going
wrong in the generate-analyse phases and should be inspected. Default is 50}

\item{include_errors}{logical; include information about which errors occurred and how often they occurred?
If \code{TRUE}, this information will be stacked at the end
of the returned simulation results with the name of the specific error used as the column name in the
data.frame object, and the number of occurrences included as the value for each condition.
Default is \code{TRUE}}

\item{seed}{a vector of integers to be used for reproducibility.
The length of the vector must be equal to the number of rows in \code{design}.
This argument calls \code{\link{set.seed}} or
\code{\link{clusterSetRNGStream}} for each condition, respectively,
but will not be run when \code{MPI = TRUE}.
Default is \code{NULL}, indicating that no seed is set for each condition}

\item{save_details}{a list pertaining to information about how and where files should be saved
  when \code{save}, \code{save_results}, or \code{save_generate_data} are triggered.

  \describe{

    \item{\code{safe}}{logical; trigger whether safe-saving should be performed. When \code{TRUE} files
      will never be over-written accidentally, and where appropriate the program will either stop or generate
      new files with unique names. Default is \code{TRUE}}

    \item{\code{compname}}{name of the computer running the simulation. Normally this doesn't need
      to be modified, but in the event that a node breaks down while running a simulation the
      results from the tmp files may be resumed on another computer by changing the name of the
      node to match the broken computer. Default is \code{unname(Sys.info()['nodename'])}}

    \item{\code{tmpfilename}}{the name of the temporary \code{.rds} file when any of the \code{save} flags are used.
       This file will be read-in if it is in the working directory, and the simulation will continue
       from the last point at which this file was saved
       (useful in case of power outages or broken nodes). Finally, this file will be deleted when the
       simulation is complete. Default is the system name (\code{compname}) appended
       to \code{'SIMDESIGN-TEMPFILE_'}}

    \item{\code{save_results_dirname}}{a string indicating the name of the folder to save
      results objects to when \code{save_results = TRUE}. If a directory/folder does not exist
      in the current working directory then one will be created automatically. Default is
      \code{'SimDesign-results_'} with the associated \code{compname} appended}

    \item{\code{save_generate_data_dirname}}{a string indicating the name of the folder to save
      data objects to when \code{save_generate_data = TRUE}. If a directory/folder does not exist
      in the current working directory then one will be created automatically.
      Within this folder nested directories will be created associated with each row in \code{design}.
      Default is \code{'SimDesign-generate-data_'} with the \code{compname} appended}

  }}

\item{edit}{a string indicating where to initiate a \code{browser()} call for editing and debugging.
  General options are \code{'none'} (default) and \code{'all'}, which are used
  to disable debugging and to debug all the user defined functions, respectively.
  Specific options include: \code{'generate'}
  to edit the data simulation function, \code{'analyse'} to edit the computational function, and
  \code{'summarise'} to  edit the aggregation function.

  Alternatively, users may place \code{\link{browser}} calls within the respective functions for
  debugging at specific lines (note: parallel computation flags will automatically be disabled
  when a \code{browser()} is detected)}

\item{verbose}{logical; print messages to the R console? Default is \code{TRUE}}
}
\value{
a \code{data.frame} (also of class \code{'SimDesign'})
  with the original \code{design} conditions in the left-most columns,
  simulation results in the middle columns, additional information (such as REPLICATIONS and SIM_TIME)
  to the right of the results, and the ERROR_MESSAGE columns in the right-most positions
}
\description{
This function runs a Monte Carlo simulation study given the simulation functions, the design conditions,
and the number of replications. Results can be saved as temporary files in case of interruptions
and may be restored by rerunning the exact function calls again, provided that the respective temp
file can be found in the working directory. To conserve RAM, temporary objects (such as
generated data across conditions and replications) are discarded; however, these can be saved to the
hard-disk by passing the appropriate flags. For longer simulations,
it is recommended to use \code{save = TRUE} to temporarily save the
simulation state. Function supports parallel and cluster computing,
global and local debugging, error handling (including fail-safe
stopping when functions fail too often, even across nodes), and is designed to be cross-platform.
}
\details{
The strategy for organizing the Monte Carlo simulation work-flow is to

\describe{
   \item{1)}{Define a suitable \code{design} data.frame. This is often expedited by using the
      \code{\link{expand.grid}} function}
   \item{2)}{Define the three step functions to simulate the data (\code{\link{generate}}),
      analyse the generated data by computing the respective parameter estimates, detection rates,
      etc (\code{\link{analyse}}), and finally summarise the results across the total
      number of replications (\code{\link{summarise}})
   }
   \item{3)}{Pass the above objects to the \code{runSimulation} function, and define the
      number of replications with the \code{replications} input}
   \item{4)}{Analyze the output from \code{runSimulation}, possibly using ANOVA techniques
     and generating suitable plots and tables}
}

For a skeleton version of the work-flow which may be useful when initially defining a simulation,
see \code{\link{SimDesign_functions}}. This function will write the template of the simulation
to one/two files so that modifying the respective functions and objects can begin immediately and
with minimal error. This means that you can focus on your Monte Carlo simulation right away rather
than worry about the administrative work required to organize the code.

Additional information for each condition is also returned:
\code{REPLICATIONS} to indicate the number of Monte Carlo replications,
\code{SIM_TIME} to indicate how long (in seconds) it took to complete
all the Monte Carlo replications for each respective condition, \code{SEED} if the \code{seed} argument
was used, and, if \code{include_errors = TRUE},
columns containing the number of replications that raised \code{try()} errors, where the column names
are the error messages prefixed with an \code{ERROR_MESSAGE} string.

Note that when running simulations in parallel (either with \code{parallel = TRUE} or \code{MPI = TRUE})
R objects defined in the global environment will \emph{not} be visible across nodes. Hence, you may see errors
such as \code{Error: object 'something' not found}. To avoid this, simply pass additional objects to the
\code{fixed_objects} input (usually it's convenient to supply a named list of these objects).
Fortunately, however, \emph{custom functions defined in the global environment are exported across
nodes automatically}. This makes it convenient when writing code because custom functions will
always be available across nodes if they are visible in the R workspace.

Additional examples, presentation files, and tutorials can be found on the package wiki located at
\url{https://github.com/philchalmers/SimDesign/wiki}.
}
\section{Storing and resuming temporary results}{


In the event of a computer crash, power outage, etc, if \code{save = TRUE} was used
then the original code in
the main source file need only be rerun again to resume the simulation.
The saved temp file will be read into the function, and the simulation will continue where it left
off before the simulation was terminated. Upon completion, a data.frame with the simulation
will be returned in the R session. If specified, an \code{.rds} file may also be saved
to the hard-drive if a suitable \code{filename} argument was included.
Finally, to save the complete list of results returned
from \code{\link{analyse}} to unique files use \code{save_results = TRUE}.
}

\section{Cluster computing}{


If the package is installed across a cluster of computers, and all the computers are accessible on
the same LAN, then the package may be run within the MPI paradigm. This simply
requires that the computers be set up using the usual MPI requirements (typically, running some flavor
of Linux, have password-less open-SSH access, addresses have been added to the \code{/etc/hosts} file, etc).

To set up the R code for an MPI cluster one need only add the argument \code{MPI = TRUE},
wrap the appropriate MPI directives around \code{runSimulation}, and submit the
files using the suitable BASH commands to execute the \code{mpirun} tool. For example,

\describe{
  \item{\code{library(doMPI)}}{}
  \item{\code{cl <- startMPIcluster()}}{}
  \item{\code{registerDoMPI(cl)}}{}
  \item{\code{Final <- runSimulation(design=Design, replications=1000, save=TRUE,
    generate=Generate, analyse=Analyse, summarise=Summarise, MPI=TRUE)}}{}
  \item{\code{saveRDS(Final, 'mysimulation.rds') # alternatively, pass a filename argument}}{}
  \item{\code{closeCluster(cl)}}{}
  \item{\code{mpi.quit()}}{}
}

This file (or files if the simulation script is broken up) needs to be uploaded to the master node,
and a BASH call to \code{mpirun}
is then used to distribute the work across slaves. For instance, if the following BASH command
is run on the master node then 16 processes
will be summoned (1 master, 15 slaves) across the computers named localhost, slave1, and slave2.

\code{mpirun -np 16 -H localhost,slave1,slave2 R --slave -f simulation.R}
}

\section{Poor man's cluster computing for independent nodes}{


In the event that you do not have access to a Beowulf-type cluster but have multiple personal
computers, then the simulation code can be manually distributed across each independent computer instead.
This simply requires passing a smaller value to the \code{replications} argument on each computer, and later
aggregating the results using the \code{\link{aggregate_simulations}} function.

For instance, if you have two computers available and wanted 500 replications you
could pass \code{replications = 300} to one computer and \code{replications = 200} to the other along
with a \code{filename} argument (or simply saving the final objects as \code{.rds} files manually).
This will create two distinct \code{.rds} files which can be
combined later with the \code{\link{aggregate_simulations}} function. The benefit of this approach over
MPI is that computers need not be linked over a LAN, and should the need arise the temporary
simulation results can be migrated to another computer in case of a complete hardware failure by modifying
the suitable \code{compname} input to \code{save_details} (or, if the \code{filename} and \code{tmpfilename}
were modified, matching those files as well).
}
\examples{

#### Step 1 --- Define your conditions under study and create design data.frame

# Use EXPLICIT, descriptive names for the design factors. Terse names such as
# N <- 100 are fine inside the simulation functions, but not here
sample_sizes <- c(30, 60, 90, 120)
standard_deviation_ratios <- c(1, 4, 8)
group_size_ratios <- c(.5, 1, 2)

# fully cross the three factors: 4 x 3 x 3 = 36 rows, one row per condition
Design <- expand.grid(sample_size=sample_sizes,
                      group_size_ratio=group_size_ratios,
                      standard_deviation_ratio=standard_deviation_ratios)
# inspect the dimensions and the first few conditions
dim(Design)
head(Design)

#~~~~~~~~~~~~~~~~~~~~~~~~
#### Step 2 --- Define generate, analyse, and summarise functions

# skeleton versions of the functions, to be edited (see help(SimDesign_functions))
SimDesign_functions()

# help(generate)
Generate <- function(condition, fixed_objects = NULL){

    # attach packages or define helper functions here if required
    # (or, better yet, call package functions through the :: operator)

    # pull the design factors for this condition (one row of the design)
    total_n <- condition$sample_size
    size_ratio <- condition$group_size_ratio
    sd_ratio <- condition$standard_deviation_ratio

    # split total_n into two groups so that group 2 is size_ratio times
    # the size of group 1
    if(size_ratio >= 1){
        n_g1 <- total_n / (size_ratio + 1)
        n_g2 <- total_n - n_g1
    } else {
        n_g2 <- total_n / (1/size_ratio + 1)
        n_g1 <- total_n - n_g2
    }

    # group 1 is standard normal; group 2 has standard deviation sd_ratio
    # (group 1 is always drawn first so the random number stream is unchanged)
    scores_g1 <- rnorm(n_g1)
    scores_g2 <- rnorm(n_g2, sd=sd_ratio)
    labels <- c(rep('g1', n_g1), rep('g2', n_g2))

    return(data.frame(group = labels, DV = c(scores_g1, scores_g2)))
}

# help(analyse)

Analyse <- function(condition, dat, fixed_objects = NULL, parameters = NULL){

    # attach packages or define helper functions here if required
    # (or, better yet, call package functions through the :: operator)
    require(stats)
    mygreatfunction <- function(x) print('Do some stuff')

    # tip: wrap estimation calls in try() statements when they may fail
    # Welch t-test (the t.test default) and the equal-variance t-test
    fit_welch <- t.test(DV ~ group, dat)
    fit_pooled <- t.test(DV ~ group, dat, var.equal=TRUE)

    # return the two p-values as a named vector; the names are used to
    # label the results later on
    pvalues <- c(welch = fit_welch$p.value, independent = fit_pooled$p.value)
    return(pvalues)
}

# help(summarise)

Summarise <- function(condition, results, fixed_objects = NULL, parameters_list = NULL){

    # summarise the replication results here; EDR() is called with alpha = .05
    # (other cut-offs such as .1 or .01 could be added in the same way)
    rates <- EDR(results, alpha = .05)

    # the named vector returned here is appended to the design input
    return(c(lessthan.05=rates))
}


#~~~~~~~~~~~~~~~~~~~~~~~~
#### Step 3 --- Collect results by looping over the rows in design

# quick serial trial run with only 5 replications per condition, to check that
# the three functions work together (and for debugging)
Final <- runSimulation(design=Design, replications=5, parallel=FALSE,
                       generate=Generate, analyse=Analyse, summarise=Summarise)

\dontrun{
# complete run with 1000 replications per condition, using parallel processing
Final <- runSimulation(design=Design, replications=1000, parallel=TRUE,
                       generate=Generate, analyse=Analyse, summarise=Summarise)
# inspect the returned simulation results
head(Final)
View(Final)

## save results to an .rds file upon completion via the filename argument (not run)
# runSimulation(design=Design, replications=1000, parallel=TRUE, filename = 'mysim',
#               generate=Generate, analyse=Analyse, summarise=Summarise)



## Debug the generate function via the edit argument. See ?browser for help on debugging
##   Type help to see available commands (e.g., n, c, where, ...),
##   ls() to see what has been defined, and type Q to quit the debugger
runSimulation(design=Design, replications=1000,
              generate=Generate, analyse=Analyse, summarise=Summarise,
              parallel=TRUE, edit='generate')

## Alternatively, place a browser() within the desired function line to
##   jump to a specific location
Summarise <- function(condition, results, fixed_objects = NULL, parameters_list = NULL){

    # signature matches the other simulation functions (fixed_objects included)

    # keep only the named p-value columns returned by Analyse, then
    # find results of interest (e.g., alpha < .1, .05, .01)
    nms <- c('welch', 'independent')
    lessthan.05 <- EDR(results[,nms], alpha = .05)

    # execution pauses here so the objects defined above can be inspected
    browser()

    # return the results that will be appended to the design input
    ret <- c(lessthan.05=lessthan.05)
    return(ret)
}

# rerun the simulation; execution stops at the browser() call placed inside
# Summarise (parallel flags are disabled automatically when a browser() is detected)
runSimulation(design=Design, replications=1000,
              generate=Generate, analyse=Analyse, summarise=Summarise,
              parallel=TRUE)




## EXTRA: To run the simulation on an MPI cluster, use the following setup on each node (not run)
# library(doMPI)
# cl <- startMPIcluster()
# registerDoMPI(cl)
# Final <- runSimulation(design=Design, replications=1000, MPI=TRUE, save=TRUE,
#                        generate=Generate, analyse=Analyse, summarise=Summarise)
# saveRDS(Final, 'mysim.rds')
# closeCluster(cl)
# mpi.quit()



#~~~~~~~~~~~~~~~~~~~~~~~~
# Step 4 --- Post-analysis: Create a new R file for analysing the Final data.frame with
# regression-based tools; e.g., use the lm() function to study main effects and
# interactions, and create plots, tables, etc.
# This is where you get to be a data analyst!

# descriptive statistics, overall and by standard deviation ratio
psych::describe(Final)
psych::describeBy(Final, group = Final$standard_deviation_ratio)

# make into factors (if helpful)
Final$f_gsr <- with(Final, factor(group_size_ratio))
Final$f_sdr <- with(Final, factor(standard_deviation_ratio))

#lm analysis (might want to change DV to a logit for better stability)
# model for the Welch test results, with the factor interaction
mod <- lm(lessthan.05.welch ~ f_gsr * f_sdr, Final)
car::Anova(mod)

# same model for the equal-variance t-test results
mod2 <- lm(lessthan.05.independent ~ f_gsr * f_sdr, Final)
car::Anova(mod2)

# make some plots
library(ggplot2)
library(reshape2)
# keep the two design columns plus the two outcome columns, then reshape with
# melt() using the design columns as the id variables
welch_ind <- Final[,c('group_size_ratio', "standard_deviation_ratio",
    "lessthan.05.welch", "lessthan.05.independent")]
dd <- melt(welch_ind, id.vars = names(welch_ind)[1:2])

# boxplots by group size ratio, one panel per test; the solid red line marks .05
# and the dotted red lines mark .025 and .075
ggplot(dd, aes(factor(group_size_ratio), value)) +
    geom_abline(intercept=0.05, slope=0, col = 'red') +
    geom_abline(intercept=0.075, slope=0, col = 'red', linetype='dotted') +
    geom_abline(intercept=0.025, slope=0, col = 'red', linetype='dotted') +
    geom_boxplot() + facet_wrap(~variable)

# same plot, additionally split into a grid by standard deviation ratio
# (the legend is hidden because the facets already label the ratio)
ggplot(dd, aes(factor(group_size_ratio), value, fill = factor(standard_deviation_ratio))) +
    geom_abline(intercept=0.05, slope=0, col = 'red') +
    geom_abline(intercept=0.075, slope=0, col = 'red', linetype='dotted') +
    geom_abline(intercept=0.025, slope=0, col = 'red', linetype='dotted') +
    geom_boxplot() + facet_grid(variable~standard_deviation_ratio) +
    theme(legend.position = 'none')

}

}
\seealso{
\code{\link{generate}}, \code{\link{analyse}}, \code{\link{summarise}},
  \code{\link{SimDesign_functions}}
}

