% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/smart_permanova.R
\name{smart_permanova}
\alias{smart_permanova}
\title{Smart Permutational Multivariate Analysis of Variance}
\arguments{
\item{snp_data}{File name read from working directory.
SNP = rows, samples = columns without row names or column headings.
SNP values must be count data (no decimals allowed).
File extension detected automatically whether text or \code{EIGENSTRAT}.
See details.}

\item{packed_data}{Logical value for \code{EIGENSTRAT}, irrelevant for text data.
Default \code{packed_data = FALSE} assumes uncompressed \code{EIGENSTRAT}.
\code{packed_data = TRUE} for compressed or binary \code{EIGENSTRAT} (\code{PACKENDANCESTRYMAP}).}

\item{sample_group}{Character or numeric vector assigning samples to groups. Coerced to factor.}

\item{sample_remove}{Logical \code{FALSE} or numeric vector indicating column numbers (samples) to be removed from computations.
Default \code{sample_remove = FALSE} keeps all samples.}

\item{snp_remove}{Logical \code{FALSE} or numeric vector indicating row numbers (SNPs) to be removed from computations.
Default \code{snp_remove = FALSE} keeps all SNPs. See details.}

\item{missing_value}{Number \code{9} or string \code{NA} indicating missing value.
Default \code{missing_value = 9} as in \code{EIGENSTRAT}.
If no missing values present, no effect on computation.}

\item{missing_impute}{String handling missing values.
Default \code{missing_impute = "mean"} replaces missing values of each SNP by mean of non-missing values across samples.
\code{missing_impute = "remove"} removes SNPs with at least one missing value.
If no missing values present, no effect on computation.}

\item{scaling}{String. Default \code{scaling = "drift"} scales SNPs to control for expected allele frequency dispersion caused by genetic drift (SMARTPCA).
\code{scaling = "center"} for \code{centering} (covariance-based PCA).
\code{scaling = "sd"} for \code{centered} SNPs divided by standard deviation (correlation-based PCA).
\code{scaling = "none"} for no scaling.
See details.}

\item{sample_distance}{Type of inter-sample proximity computed (distance, similarity, dissimilarity).
Default is \code{Euclidean distance}. See details.}

\item{program_distance}{A string value indicating R package to estimate proximities between pairs of samples.
Default \code{program_distance = "Rfast"} uses function \code{\link[Rfast]{Dist}}; \code{program_distance = "vegan"} uses \code{\link[vegan]{vegdist}}.
See details.}

\item{target_space}{String.
Default \code{target_space = "multidimensional"} applies PERMANOVA to sample-by-sample triangular matrix computed from variable-by-sample data; \code{pc_axes} has no effect on computation.
\code{target_space = "pca"} applies PERMANOVA to sample-by-sample data in PCA space; \code{pc_axes} determines number of PCA axes for testing.}

\item{pc_axes}{Number of PCA axes computed always starting with PCA axis 1. Default \code{pc_axes = 2} computes PCA axes 1 and 2 if \code{target_space = "pca"}.
No effect on computation if \code{target_space = "multidimensional"}.}

\item{pairwise}{Logical.
Default \code{pairwise = FALSE} computes global test.
\code{pairwise = TRUE} computes global and pairwise tests.}

\item{pairwise_method}{String specifying type of correction for multiple testing.
Default \code{"holm"}.
See details.}

\item{permutation_n}{Number of permutations used to compute the PERMANOVA test \emph{p value}.
Default \code{9999}.}

\item{permutation_seed}{Number fixing random generator of permutations.
Default \code{1}.}
}
\value{
Returns a list containing the following elements:
\describe{
  \item{permanova.samples}{Dataframe showing sample summary.
  Column \emph{Group} assigns samples to tested groups.
  Column \emph{Class} specifies if samples were used in, or removed from, testing.}
  \item{permanova.global_test}{List showing table with degrees of freedom, sum of squares, mean sum of squares, \emph{F} statistic, variance explained (\emph{R2}) and \emph{p} value.}
  \item{permanova.pairwise_test}{List showing table with \emph{F} statistic, variance explained (\emph{R2}), \emph{p} value and corrected \emph{p} value per pair of groups.
  Obtained only if \code{pairwise = TRUE}.}
  \item{permanova.pairwise_correction}{String indicating type of correction for multiple testing.}
  \item{permanova.permutation_number}{Number of permutations applied to obtain the null distribution from which the \emph{p value} is computed.}
  \item{permanova.permutation_seed}{Number fixing random generator of permutations for reproducibility of results.}
}
}
\description{
Computes Permutational Multivariate Analysis of Variance (PERMANOVA) for testing differences in group location using multivariate data. Variance partitioning computed on a sample-by-sample triangular matrix obtained from variable-by-sample data following Anderson (2001).
Calculates a range of inter-sample distances, similarities and dissimilarities.
Includes control for genetic drift for bi-allelic genetic markers such as single nucleotide polymorphisms (SNP) following Patterson, Price and Reich (2006) that can be combined with SMART Principal Component Analysis (PCA). Optimized to run fast matrix building and permutations for big datasets in ecological, evolutionary and genomic research.
}
\details{
PERMANOVA is a form of linear modelling that partitions variation in a triangular matrix of inter-sample proximities obtained from variable-by-sample data.
Uses permutations to estimate the probability of observed group differences in SNP composition given a null hypothesis of no differences between groups (Anderson 2001).
Proximity between samples can be any type of distance, similarity or dissimilarity.
Original acronym \code{NPMANOVA} (Non-Parametric MANOVA) replaced with PERMANOVA (Anderson 2004, 2017).\cr

Univariate ANOVA captures differences in mean and variance referred to as location and dispersion in PERMANOVA's multivariate context (Anderson & Walsh 2013, Warton, Wright and Wang 2012).
To attribute group differences to location (position of sample groups) and/or dispersion (spread of sample groups), PERMANOVA must be combined with PERMDISP as implemented through \code{smart_permdisp}.\cr

Function \code{smart_permanova} uses \code{\link[vegan]{adonis}} to fit formula \code{snp_eucli ~ sample_group}, where \code{snp_eucli} is the sample-by-sample triangular matrix in Principal Coordinate Analysis (Gower 1966) space.
Current version restricted to one-way designs (one categorical predictor) though PERMANOVA can handle >1 crossed and/or nested factors (Anderson 2001) and continuous predictors (McArdle & Anderson 2001).
If >2 sample groups tested, \code{pairwise = TRUE} allows pairwise testing and correction for multiple testing by \code{holm (Holm)} [default], \code{hochberg (Hochberg)}, \code{hommel (Hommel)}, \code{bonferroni (Bonferroni)}, \code{BY (Benjamini-Yekutieli)}, \code{BH (Benjamini-Hochberg)} or \code{fdr (False Discovery Rate)}.\cr

For big data, \code{\link[Rfast]{Dist}} builds sample-by-sample triangular matrix much faster than \code{\link[vegan]{vegdist}}.
\code{\link[Rfast]{Dist}} computes proximities \code{euclidean}, \code{manhattan}, \code{canberra1}, \code{canberra2}, \code{minimum}, \code{maximum}, \code{minkowski}, \code{bhattacharyya}, \code{hellinger}, \code{kullback_leibler} and \code{jensen_shannon}. \code{\link[vegan]{vegdist}} computes \code{manhattan}, \code{euclidean}, \code{canberra}, \code{clark}, \code{bray}, \code{kulczynski}, \code{jaccard}, \code{gower}, \code{altGower}, \code{morisita}, \code{horn}, \code{mountford}, \code{raup}, \code{binomial}, \code{chao}, \code{cao} and \code{mahalanobis}.
Euclidean distance required for SMARTPCA scaling.\cr

\code{sample_remove} should include both samples removed from PCA and ancient samples projected onto PCA space (if any).\cr

Data read from working directory with SNPs as rows and samples as columns.
Two alternative formats: (1) text file of SNPs by samples (file extension and column separators recognized automatically) read using \code{\link[data.table]{fread}}; or (2) duet of \code{EIGENSTRAT} files (see \url{https://reich.hms.harvard.edu/software}) using \code{\link[vroom]{vroom_fwf}}, including a genotype file of SNPs by samples (\code{*.geno}), and a sample file (\code{*.ind}) containing three vectors assigning individual samples to unique user-predefined groups (populations), sexes (or other user-defined descriptor) and alphanumeric identifiers.
For \code{EIGENSTRAT}, vector \code{sample_group} assigns samples to groups retrievable from column 3 of file \code{*.ind}.
SNPs with zero variance removed prior to SVD to optimize computation time and avoid undefined values if \code{scaling = "sd"} or \code{"drift"}.\cr

Users can select subsets of samples or SNPs by introducing a vector including column numbers for samples (\code{sample_remove}) and/or row numbers for SNPs (\code{snp_remove}) to be removed from computations.
Function stops if the final number of SNPs is 1 or 2.
\code{EIGENSOFT} was conceived for the analysis of human genes, so its SMARTPCA suite accepts 22 (autosomal) chromosomes by default.
If >22 chromosomes are provided and the internal parameter \code{numchrom} is not set to the target number of chromosomes of interest, SMARTPCA automatically subsets chromosomes 1 to 22.
In contrast, \code{smart_permanova} accepts any number of autosomes with or without the sex chromosomes from an \code{EIGENSTRAT} file.\cr
}
\examples{
# Locate the example genotype matrix "dataSNP" shipped with package smartsnp
pathToGenoFile = system.file("extdata", "dataSNP", package = "smartsnp")

# Assign 50 samples to each of two groups (A and B), 100 samples in total
my_groups <- as.factor(c(rep("A", 50), rep("B", 50)))

# Run PERMANOVA with all other arguments left at their defaults
permanovaR <- smart_permanova(snp_data = pathToGenoFile, sample_group = my_groups)

# Extract summary table assigning samples to groups
permanovaR$permanova.samples

# Extract PERMANOVA table (global test)
permanovaR$permanova.global_test

# Plot mean squares (groups versus residuals) and ANOSIM proximity ranks
# Run PCA with smart_pca; sample coordinates on PC1 and PC2 are used below
pcaR1 <- smart_pca(snp_data = pathToGenoFile, sample_group = my_groups)
# Compute Euclidean inter-sample distances in PCA space (triangular matrix)
snp_eucli <- vegan::vegdist(pcaR1$pca.sample_coordinates[,c("PC1","PC2")], method = "euclidean")
# Run PERMANOVA on the PCA-space distances using 9999 permutations
# NOTE(review): vegan::adonis appears to be deprecated in recent vegan releases
# in favour of adonis2, whose result has a different structure - confirm
# against the vegan version this package targets
permanova <- vegan::adonis(formula = snp_eucli ~ my_groups, permutations = 9999)
# Extract the first two mean squares (groups versus residuals) as a 1 x 2 matrix
meanSqs <- as.matrix(t(permanova$aov.tab$MeanSqs[1:2]))
colnames(meanSqs) <- c("Groups", "Residuals")
# Two stacked panels (2 rows, 1 column); previous graphical parameters are
# saved in oldpar so they can be restored at the end
oldpar <- par(mfrow = c(2,1), oma = c(0,5,0.1,0.1), lwd = 2)
# Panel 1: horizontal bar plot of the mean squares
barplot(meanSqs, horiz = TRUE, main = "PERMANOVA mean of squares",
  cex.names = 2, cex.main = 2, col = c("grey40"))
# Run ANOSIM on the same distance matrix and grouping using 999 permutations
anosimD <- vegan::anosim(snp_eucli, my_groups, permutations = 999)
# Optional (disabled): remove outputs for clean plotting
#anosimD[2] <- ""; anosimD[5] <- ""
# Set margins for the second panel
par(mar = c(5, 0.1, 3.5, 0.1))
# Panel 2: plot of the ANOSIM object (inter-sample proximity ranks)
plot(anosimD, xlab = "", ylab = "distance/similarity ranks",
  main = "Inter-sample proximity ranks", cex.main =2, cex.axis = 2,
  col = c("cyan", "red", "blue"))
# Restore the graphical parameters saved before plotting
par(oldpar)

}
\references{
Anderson, M. J. (2001) A new method for non-parametric multivariate analysis of variance. Austral Ecology, 26, 32-46.\cr
Anderson, M. J. (2004) PERMANOVA_2factor: a FORTRAN computer program for permutational multivariate analysis of variance (for any two-factor ANOVA design) using permutation tests (Department of Statistics, University of Auckland, New Zealand).\cr
Anderson, M. J. & D. C. I. Walsh (2013) PERMANOVA, ANOSIM, and the Mantel test in the face of heterogeneous dispersions: What null hypothesis are you testing? Ecological Monographs, 83, 557-574.\cr
Gower, J. C. (1966) Some distance properties of latent root and vector methods used in multivariate analysis. Biometrika, 53, 325-338.\cr
McArdle, B. H. & M. J. Anderson (2001) Fitting multivariate models to community data: a comment on distance-based redundancy analysis. Ecology, 82, 290-297.\cr
Patterson, N., A. L. Price and D. Reich (2006) Population structure and eigenanalysis. PLoS Genetics, 2, e190.\cr
Warton, D. I., S. T. Wright and Y. Wang (2012) Distance-based multivariate analyses confound location and dispersion effects. Methods in Ecology and Evolution, 3, 89-101.
}
\seealso{
\code{\link[vegan]{adonis}} (package \bold{vegan}),
\code{\link[Rfast]{Dist}} (package \bold{Rfast}),
\code{\link[data.table]{fread}} (package \bold{data.table}),
\code{\link[vegan]{vegdist}} (package \bold{vegan}),
\code{\link[vroom]{vroom_fwf}} (package \bold{vroom})
}
