% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/tokens_ngrams.R
\name{tokens_ngrams}
\alias{tokens_ngrams}
\alias{char_ngrams}
\alias{tokens_skipgrams}
\title{Create n-grams and skip-grams from tokens}
\usage{
tokens_ngrams(
  x,
  n = 2L,
  skip = 0L,
  concatenator = concat(x),
  apply_if = NULL,
  verbose = quanteda_options("verbose")
)

char_ngrams(x, n = 2L, skip = 0L, concatenator = "_")

tokens_skipgrams(
  x,
  n,
  skip,
  concatenator = concat(x),
  apply_if = NULL,
  verbose = quanteda_options("verbose")
)
}
\arguments{
\item{x}{a tokens object, or a character vector, or a list of characters}

\item{n}{integer vector specifying the number of elements to be concatenated
in each n-gram.  Each element of this vector will define an \eqn{n} in the
\eqn{n}-gram(s) that are produced.}

\item{skip}{integer vector specifying the adjacency skip size for tokens
forming the n-grams, default is 0 for only immediately neighbouring words.
For \code{tokens_skipgrams()}, \code{skip} can be a vector of integers, as the
"classic" approach to forming skip-grams is to set skip = \eqn{k} where
\eqn{k} is the distance for which \eqn{k} or fewer skips are used to
construct the \eqn{n}-gram.  Thus a "4-skip-n-gram" defined as
\code{skip = 0:4} produces results that include 4 skips, 3 skips, 2 skips,
1 skip, and 0 skips (where 0 skips are typical n-grams formed from adjacent
words).  See Guthrie et al. (2006).}

\item{concatenator}{character for combining words, default is \verb{_}
(underscore) character}

\item{apply_if}{logical vector of length \code{ndoc(x)}; documents are modified
only when corresponding values are \code{TRUE}, others are left unchanged.}

\item{verbose}{if \code{TRUE} print the number of tokens and documents before and
after the function is applied. The number of tokens does not include paddings.}
}
\value{
a tokens object consisting of a list of character vectors of n-grams, one
list element per text, or a character vector if called on a simple
character vector
}
\description{
Create a set of n-grams (tokens in sequence) from already tokenized text
objects, with an optional skip argument to form skip-grams. Both the n-gram
length and the skip lengths take vectors of arguments to form multiple
lengths or skips in one pass.  Implemented in C++ for efficiency.
}
\details{
Normally, these functions will be called through
\code{\link[=tokens]{tokens}(x, ngrams = , ...)}, but these functions are provided
in case a user wants to perform lower-level n-gram construction on tokenized
texts.

\code{\link[=tokens_skipgrams]{tokens_skipgrams()}} is a wrapper to \code{\link[=tokens_ngrams]{tokens_ngrams()}} that requires
arguments to be supplied for both \code{n} and \code{skip}. For \eqn{k}-skip
skip-grams, set \code{skip} to \verb{0:}\eqn{k}, in order to conform to the
definition of skip-grams found in Guthrie et al. (2006): A \eqn{k} skip-gram
is an n-gram which is a superset of all n-grams and each \eqn{(k-i)}
skip-gram until \eqn{(k-i)==0} (which includes 0 skip-grams).
}
\note{
\code{char_ngrams} is a convenience wrapper for a (non-list)
vector of characters, so named to be consistent with \pkg{quanteda}'s naming
scheme.
}
\examples{
# ngrams
tokens_ngrams(tokens(c("a b c d e", "c d e f g")), n = 2:3)

toks <- tokens(c(text1 = "the quick brown fox jumped over the lazy dog"))
tokens_ngrams(toks, n = 1:3)
tokens_ngrams(toks, n = c(2,4), concatenator = " ")
tokens_ngrams(toks, n = c(2,4), skip = 1, concatenator = " ")
# skipgrams
toks <- tokens("insurgents killed in ongoing fighting")
tokens_skipgrams(toks, n = 2, skip = 0:1, concatenator = " ")
tokens_skipgrams(toks, n = 2, skip = 0:2, concatenator = " ")
tokens_skipgrams(toks, n = 3, skip = 0:2, concatenator = " ")
}
\references{
Guthrie, David, Ben Allison, Wei Liu, Louise Guthrie, and Yorick Wilks. 2006.
"A Closer Look at Skip-Gram Modelling." \url{https://aclanthology.org/L06-1210/}
}
