% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/smith_waterman.R
\name{smith_waterman}
\alias{smith_waterman}
\title{Align text using Smith-Waterman}
\usage{
smith_waterman(a, b, type = c("characters", "words"), match = 2L,
  mismatch = -1L, gap = -1L, lower = TRUE, similarity = function(x,
  y) ifelse(x == y, 2L, -1L), tokenizer, collapse, edit_mark = "#",
  implementation = c("R", "Rcpp"))
}
\arguments{
\item{a}{a character string of length one}

\item{b}{a character string of length one}

\item{type}{either 'characters' or 'words' indicating to align based on a sequence of characters or a sequence of words. Defaults to 'characters'.}

\item{match}{integer value of a score to assign a match (a letter/word from a and b which are the same during alignment). This value should be bigger than zero. Defaults to 2.}

\item{mismatch}{integer value of a score to assign a mismatch (leave out 1 word / 1 letter from 1 of the 2 input strings during alignment). This value should be smaller than or equal to zero. Defaults to -1.}

\item{gap}{integer value of a score to assign a gap (drop 1 word / letter from each of the 2 input strings during alignment). This value should be smaller than or equal to zero. Defaults to -1.}

\item{lower}{logical indicating to lowercase text before doing the alignment. Defaults to TRUE.}

\item{similarity}{optionally, a function to compare 2 characters or words. 
This function should have 2 arguments x and y with the 2 letters / words to compare and should return 1 number indicating
the similarity between x and y. See the examples.}

\item{tokenizer}{a function to tokenise text into either a sequence of characters or a sequence of words.
Defaults to \code{\link{tokenize_letters}} if type is \code{'characters'} and \code{\link{tokenize_spaces_punct}} if type is \code{'words'}.}

\item{collapse}{separator used to combine characters / words back together in the output. Defaults to '' for type 'characters' and a space for type 'words'.}

\item{edit_mark}{separator to indicate a gap/mismatch between sequences. Defaults to the hashtag symbol.}

\item{implementation}{either 'R' or 'Rcpp' indicating to perform the alignment with plain R code or in Rcpp. Defaults to 'R'.}
}
\value{
an object of class smith_waterman which is a list with elements
\itemize{
 \item{type: }{The alignment \code{type}}
 \item{sw: }{The Smith-Waterman local alignment score}
 \item{similarity: }{Score between 0 and 1, calculated as the Smith-Waterman local alignment score / (the number of letters/words in the shortest text times the match weight)}
 \item{weights: }{The list of weights provided to the function: match, mismatch and gap}
 \item{matches: }{The number of matches found during alignment}
 \item{mismatches: }{The number of mismatches found during alignment}
 \item{a: }{A list with alignment information from the text provided in \code{a}. The list elements are documented below}
 \item{b: }{A list with alignment information from the text provided in \code{b}. The list elements are documented below}
}
Elements \code{a} and \code{b} are both lists which contain
\itemize{
 \item{text: }{The provided character string of either a or b}
 \item{tokens: }{A character vector of the tokenised texts of a or b}
 \item{n: }{The length of \code{tokens}}
 \item{alignment: }{A list with the following elements}
   \itemize{
   \item{text: }{The aligned text from either a or b where gaps/mismatches are filled up with the \code{edit_mark} symbol}
   \item{tokens: }{The character vector of tokens which form the aligned \code{text}}
   \item{n: }{The length of the aligned \code{text}}
   \item{gaps: }{The number of gaps during alignment}
   \item{from: }{The starting position in the full tokenised \code{tokens} element from either a or b where the aligned text is found. See the example.}
   \item{to: }{The end position in the full tokenised \code{tokens} element from either a or b where the aligned text is found. See the example.}
  } 
}
}
\description{
Align text using the Smith-Waterman algorithm. 
The Smith-Waterman algorithm performs local sequence alignment. 
It finds similar regions between two strings.\cr
Similar regions are a sequence of either characters or words which are found by matching the characters or words of 2 sequences of strings.\cr 
If the word/letter is the same in each text, the alignment score is increased with the match score, while if they are not the same the local alignment score drops by the gap score.
If one of the 2 texts contains extra words/letters, the score drops by the mismatch score.
}
\details{
The implementation is similar to that of the \code{textreuse::local_align} function, and additionally allows aligning character sequences as well as word sequences.
}
\examples{
## align sequence of letters
smith_waterman("Joske Vermeulen", "Jiske Vermoelen")
smith_waterman("Joske Vermeulen", "Ik zoek naar Jiske Versmoelen, waar is die te vinden")
smith_waterman("Joske", "Jiske")
smith_waterman("Joske", "Jiske",
               similarity = function(x, y) ifelse(x == y | (x == "o" & y == "i"), 2L, -1L))

## align sequence of words
a <- "The answer is blowin' in the wind."
b <- "As the Bob Dylan song says, the answer is blowing in the wind."
smith_waterman(a, b)
smith_waterman(a, b, type = "characters")
smith_waterman(a, b, type = "words")
smith_waterman(a, b, type = "words", similarity = function(x, y) adist(x, y))
smith_waterman(a, b, type = "words", 
               tokenizer = function(x) unlist(strsplit(x, "[[:space:]]")))
x <- smith_waterman(a, b, type = "words")
x$b$tokens[x$b$alignment$from:x$b$alignment$to]            
               
# examples on aligning text files
a <- system.file(package = "text.alignment", "extdata", "example1.txt")
a <- readLines(a)
a <- paste(a, collapse = "\\n")
b <- system.file(package = "text.alignment", "extdata", "example2.txt")
b <- readLines(b)
b <- paste(b, collapse = "\\n")
smith_waterman(a, b, type = "characters")
smith_waterman(a, b, type = "words")
smith_waterman("Gistel Hof", b, type = "characters")
smith_waterman("Bailiestraat", b, type = "characters")
smith_waterman("Lange rei", b, type = "characters")

# examples on extracting where elements were found
x <- smith_waterman("Lange rei", b)
x$b$tokens[x$b$alignment$from:x$b$alignment$to]
as.data.frame(x)

x <- lapply(c("Lange rei", "Gistel Hof", NA, "Test"), FUN = function(a){
  x <- smith_waterman(a, b)
  x <- as.data.frame(x)
  x
})
x <- do.call(rbind, x)
x
}
\seealso{
\url{https://en.wikipedia.org/wiki/Smith-Waterman_algorithm}
}
