% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/dataset_imdb.R
\name{dataset_imdb}
\alias{dataset_imdb}
\title{IMDB Large Movie Review Dataset}
\source{
\url{http://ai.stanford.edu/~amaas/data/sentiment/}
}
\usage{
dataset_imdb(dir = NULL, split = c("train", "test"), delete = FALSE,
  return_path = FALSE)
}
\arguments{
\item{dir}{Character, path to directory where data will be stored. If
\code{NULL}, \code{\link[rappdirs]{user_cache_dir}} will be used to determine
the path.}

\item{split}{Character. Return training ("train") data or testing ("test")
data. Defaults to "train".}

\item{delete}{Logical, set \code{TRUE} to delete dataset.}

\item{return_path}{Logical, set \code{TRUE} to return the path of the dataset.}
}
\value{
A tibble with 25,000 rows and 2 variables:
\describe{
  \item{sentiment}{Character, denoting the sentiment}
  \item{text}{Character, text of the review}
}
}
\description{
The core dataset contains 50,000 reviews split evenly into 25k train and
25k test sets. The overall distribution of labels is balanced (25k pos and
25k neg).
}
\details{
In the entire collection, no more than 30 reviews are allowed for any
given movie because reviews for the same movie tend to have correlated
ratings. Further, the train and test sets contain a disjoint set of
movies, so no significant performance is obtained by memorizing
movie-unique terms and their association with observed labels. In the
labeled train/test sets, a negative review has a score <= 4 out of 10,
and a positive review has a score >= 7 out of 10. Thus reviews with
more neutral ratings are not included in the train/test sets. In the
unsupervised set, reviews of any rating are included and there is an
equal number of reviews with ratings > 5 and <= 5.

When using this dataset, please cite the ACL 2011 paper

@InProceedings\{maas-EtAl:2011:ACL-HLT2011, \cr
author    = \{Maas, Andrew L.  and  Daly, Raymond E.  and  Pham, Peter T.  and  Huang, Dan  and  Ng, Andrew Y.  and  Potts, Christopher\}, \cr
title     = \{Learning Word Vectors for Sentiment Analysis\}, \cr
booktitle = \{Proceedings of the 49th Annual Meeting of the Association for Computational Linguistics: Human Language Technologies\}, \cr
month     = \{June\}, \cr
year      = \{2011\}, \cr
address   = \{Portland, Oregon, USA\}, \cr
publisher = \{Association for Computational Linguistics\}, \cr
pages     = \{142--150\}, \cr
url       = \{http://www.aclweb.org/anthology/P11-1015\}
\}
}
\examples{
\donttest{
dataset_imdb()

# Custom directory
dataset_imdb(dir = "data/")

# Deleting dataset
dataset_imdb(delete = TRUE)

# Returning filepath of data
dataset_imdb(return_path = TRUE)

# Access both training and testing dataset
train <- dataset_imdb(split = "train")
test <- dataset_imdb(split = "test")
}

}
\concept{topic sentiment}
\keyword{datasets}
