## ----setup, include=FALSE, cache=FALSE------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
## Rendering setup for the vignette: default knitr options applied to every
## chunk, and a console output width of 1000 characters.
knitr::opts_chunk$set(
  echo = TRUE,
  eval = TRUE,
  message = FALSE,
  comment = NA
)
options(width = 1000)

## -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
## Attach udpipe, download the Dutch model and show the structure of the
## object returned by the download (its file_model element is used later on
## to load the model).
library(udpipe)
dl <- udpipe_download_model(language = "dutch")
utils::str(dl)

## -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
## Either give a file in the current working directory.
## The file name is taken from the download result instead of being typed out:
## it embeds the model release, so a hard-coded name stops matching as soon as
## a different release is downloaded.
## NOTE(review): this assumes the model was saved in the working directory,
## i.e. that udpipe_download_model() was called without another model_dir.
udmodel_dutch <- udpipe_load_model(file = basename(dl$file_model))
## Or give the full path to the file
udmodel_dutch <- udpipe_load_model(file = dl$file_model)

## -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
## Two Dutch example documents, one element per document.
txt <- c(
  "Ik ben de weg kwijt, kunt u me zeggen waar de Lange Wapper ligt? Jazeker meneer",
  "Het gaat vooruit, het gaat verbazend goed vooruit"
)
## Annotate with the default settings and flatten the result to a data.frame.
x <- as.data.frame(udpipe_annotate(udmodel_dutch, x = txt))
str(x)
## Count the values found in the upos column.
table(x[["upos"]])

## ---- results='hide'------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
## Variant 1: tokenization and sentence detection only.
## Neither the tagger (POS tagging, lemmatization) nor the dependency parser runs.
x <- as.data.frame(
  udpipe_annotate(udmodel_dutch, x = txt, tagger = "none", parser = "none")
)
table(x[["upos"]])
table(x[["dep_rel"]])

## Variant 2: tokenization, sentence detection, POS tagging and lemmatization,
## but the dependency parser is switched off.
x <- as.data.frame(
  udpipe_annotate(udmodel_dutch, x = txt, tagger = "default", parser = "none")
)
table(x[["upos"]])
table(x[["dep_rel"]])

## Variant 3: tokenization, sentence detection and dependency parsing,
## but no POS tagging and no lemmatization.
x <- as.data.frame(
  udpipe_annotate(udmodel_dutch, x = txt, tagger = "none", parser = "default")
)
table(x[["upos"]])
table(x[["dep_rel"]])

## ---- results='hide'------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
## Annotating text that has already been tokenized.
## `input` holds one character vector of tokens per document; note that
## "Lange Wapper" is a single token which contains a space.
input <- list(doc1 = c("Ik", "ben", "de", "weg", "kwijt", ",", "kunt", "u", "me", "zeggen", 
                       "waar", "de", "Lange Wapper", "ligt", "?", "Jazeker", "meneer"),
              doc2 = c("Het", "gaat", "vooruit", ",", "het", "gaat", "verbazend", "goed", "vooruit"))

## Either put every token on a new line and use tokenizer: vertical
## vapply() (instead of sapply()) guarantees exactly one string per document,
## whatever the shape of `input`.
txt <- vapply(input, FUN = function(tokens) paste(tokens, collapse = "\n"),
              FUN.VALUE = character(1))
x <- udpipe_annotate(udmodel_dutch, x = txt, tokenizer = "vertical")
x <- as.data.frame(x)

## Or put every token of each document in 1 string separated by a space and use tokenizer: horizontal
##   Mark that if a token contains a space, you need to replace the space 
##   with the 'NO-BREAK SPACE' (U+00A0) character to make sure it is still considered as one token
txt <- vapply(input, FUN = function(tokens) {
  tokens <- gsub(" ", intToUtf8(160), tokens) ## replace space with no-break-space
  paste(tokens, collapse = " ")
}, FUN.VALUE = character(1))
x <- udpipe_annotate(udmodel_dutch, x = as.character(txt), tokenizer = "horizontal")
x <- as.data.frame(x)

## -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
## Download and load the Sanskrit model.
dl <- udpipe_download_model(language = "sanskrit")
udmodel_sanskrit <- udpipe_load_model(file = dl$file_model)

## Sanskrit sample text in Devanagari, one vector element per
## whitespace-separated word. The characters are spelled as \uXXXX escapes so
## that this file stays plain ASCII and the resulting string holds the real
## code points (not placeholder text such as "<U+0924>") whatever locale the
## script is parsed in.
txt <- paste(
  c("\u0924\u0924\u0903", "\u0905\u0938\u094c", "\u092a\u094d\u0930\u093e\u0939",
    "\u0915\u094d\u0937\u0924\u094d\u0930\u093f\u092f\u0938\u094d\u092f",
    "\u0924\u093f\u0938\u094d\u0930\u0903", "\u092d\u093e\u0930\u094d\u092f\u093e",
    "\u0927\u0930\u094d\u092e\u092e\u094d", "\u092d\u0935\u0928\u094d\u0924\u093f",
    "\u0924\u0924\u094d", "\u090f\u0937\u093e",
    "\u0915\u0926\u093e\u091a\u093f\u0926\u094d", "\u0935\u0948\u0936\u094d\u092f\u093e",
    "\u0938\u0941\u0924\u093e", "\u092d\u0935\u093f\u0937\u094d\u092f\u0924\u093f",
    "\u0924\u0924\u094d", "\u0905\u0928\u0941\u0930\u093e\u0917\u0903",
    "\u092e\u092e\u093e\u0938\u094d\u092f\u093e\u092e\u094d", "\u0924\u0924\u0903",
    "\u0930\u0925\u0915\u093e\u0930\u0903", "\u0924\u0938\u094d\u092f",
    "\u0928\u093f\u0936\u094d\u091a\u092f\u092e\u094d",
    "\u0935\u093f\u091c\u094d\u091e\u093e\u092f\u093e\u0935\u0926\u0924\u094d",
    "\u0935\u092f\u0938\u094d\u092f", "\u0915\u093f\u092e\u094d", "\u0905",
    "\u0927\u0941\u0928\u093e", "\u0915\u0930\u094d\u0924\u0935\u094d\u092f\u092e\u094d",
    "\u0915\u094c\u0932\u093f\u0915\u0903", "\u0906\u0939", "\u0915\u093f\u092e\u094d",
    "\u0905\u0939\u092e\u094d", "\u091c\u093e\u0928\u093e\u092e\u093f",
    "\u0924\u094d\u0935\u092f\u093f", "\u092e\u093f\u0924\u094d\u0930\u0947",
    "\u092f\u0924\u094d", "\u0905\u092d\u093f\u0939\u093f\u0924\u0902",
    "\u092e\u092f\u093e", "\u0924\u0924\u0903"),
  collapse = " ")

## Annotate, show the declared encoding of the CoNLL-U output, then flatten
## the annotation to a data.frame.
x <- udpipe_annotate(udmodel_sanskrit, x = txt)
Encoding(x$conllu)
x <- as.data.frame(x)

## -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
## Annotate the Sanskrit text again and save the CoNLL-U output to a file.
x <- udpipe_annotate(udmodel_sanskrit, x = txt)
## Write the string byte-for-byte as UTF-8. cat(..., file = ) converts text to
## the native encoding first, which on a non-UTF-8 locale replaces characters
## it cannot represent (e.g. with "<U+0924>"). sep = "" keeps the file content
## equal to the string itself, without an extra trailing newline.
con <- file("myannotation.conllu", open = "w")
tryCatch(
  writeLines(enc2utf8(x$conllu), con = con, sep = "", useBytes = TRUE),
  finally = close(con)
)

## ---- results='hide', echo=FALSE------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
## Remove the files this script created: the two downloaded models and the
## CoNLL-U export.
## The model paths are taken from the objects created above, because the file
## names embed the model release and may differ from the legacy names listed
## below. At this point `dl` refers to the Sanskrit download.
## NOTE(review): the `file` element of a loaded model is expected to hold the
## path it was loaded from - confirm against the installed udpipe version.
## If it is absent, `$` yields NULL and c() simply drops it.
## unlink() is used instead of file.remove() since it does not warn about
## files that do not exist.
unlink(c(udmodel_dutch$file,
         dl$file_model,
         "dutch-ud-2.0-170801.udpipe",
         "sanskrit-ud-2.0-170801.udpipe",
         "myannotation.conllu"))

