diff --git a/inst/doc/monkeylearn_intro.R b/inst/doc/monkeylearn_intro.R
new file mode 100644
index 0000000..f8083b2
--- /dev/null
+++ b/inst/doc/monkeylearn_intro.R
@@ -0,0 +1,146 @@
+## ---- echo = FALSE, warning=FALSE, message=FALSE-------------------------
+NOT_CRAN <- identical(tolower(Sys.getenv("NOT_CRAN")), "true")
+pat <- Sys.getenv("MONKEYLEARN_KEY")
+IS_THERE_KEY <- (pat != "")
+NOT_CRAN <- ifelse(IS_THERE_KEY, NOT_CRAN, FALSE)
+knitr::opts_chunk$set(
+ collapse = TRUE,
+ comment = "#>",
+ purl = NOT_CRAN,
+ eval = NOT_CRAN
+)
+
+## ---- message = FALSE----------------------------------------------------
+library(monkeylearn)
+library(magrittr)
+
+text <- "In the 19th century, the major European powers had gone to great lengths to maintain a balance of power throughout Europe, resulting in the existence of a complex network of political and military alliances throughout the continent by 1900.[7] These had started in 1815, with the Holy Alliance between Prussia, Russia, and Austria. Then, in October 1873, German Chancellor Otto von Bismarck negotiated the League of the Three Emperors (German: Dreikaiserbund) between the monarchs of Austria-Hungary, Russia and Germany."
+output <- monkey_extract(input = text,
+ extractor_id = "ex_isnnZRbS")
+output
+attr(output, "headers")
+
+## ------------------------------------------------------------------------
+text <- "A panel of Goldman Sachs employees spent a recent Tuesday night at the
+Columbia University faculty club trying to convince a packed room of potential
+recruits that Wall Street, not Silicon Valley, was the place to be for computer
+scientists.\n\n The Goldman employees knew they had an uphill battle. They were
+fighting against perceptions of Wall Street as boring and regulation-bound and
+Silicon Valley as the promised land of flip-flops, beanbag chairs and million-dollar
+stock options.\n\n Their argument to the room of technologically inclined students
+was that Wall Street was where they could find far more challenging, diverse and,
+yes, lucrative jobs working on some of the worlds most difficult technical problems."
+
+output <- monkey_extract(text,
+ extractor_id = "ex_y7BPYzNG",
+ params = list(max_keywords = 3))
+output
+output2 <- monkey_extract(text,
+ extractor_id = "ex_y7BPYzNG",
+ params = list(max_keywords = 1))
+output2
+attr(output2, "headers")
+
+## ---- message = FALSE----------------------------------------------------
+text <- "A panel of Goldman Sachs employees spent a recent Tuesday night at the Columbia University faculty club trying to convince a packed room of potential recruits that Wall Street, not Silicon Valley, was the place to be for computer scientists.
+
+The Goldman employees knew they had an uphill battle. They were fighting against perceptions of Wall Street as boring and regulation-bound and Silicon Valley as the promised land of flip-flops, beanbag chairs and million-dollar stock options.
+
+Their argument to the room of technologically inclined students was that Wall Street was where they could find far more challenging, diverse and, yes, lucrative jobs working on some of the world’s most difficult technical problems.
+
+“Whereas in other opportunities you might be considering, it is working one type of data or one type of application, we deal in hundreds of products in hundreds of markets, with thousands or tens of thousands of clients, every day, millions of times of day worldwide,” Afsheen Afshar, a managing director at Goldman Sachs, told the students."
+
+monkey_extract(text, extractor_id = "ex_y7BPYzNG")
+
+## ---- message = FALSE----------------------------------------------------
+text <- "Hi, my email is john@example.com and my credit card is 4242-4242-4242-4242 so you can charge me with $10. My phone number is 15555 9876. We can get in touch on April 16, at 10:00am"
+text2 <- "Hi, my email is mary@example.com and my credit card is 4242-4232-4242-4242. My phone number is 16655 9876. We can get in touch on April 16, at 10:00am"
+
+monkey_extract(c(text, text2), extractor_id = "ex_dqRio5sG", unnest = TRUE)
+
+## ---- message = FALSE----------------------------------------------------
+text1 <- "my dog is an avid rice eater"
+text2 <- "i want to buy an iphone"
+request <- c(text1, text2)
+
+monkey_classify(request, classifier_id = "cl_oFKL5wft")
+
+## ------------------------------------------------------------------------
+monkeylearn_classifiers(private = FALSE)
+
+## ---- message = FALSE----------------------------------------------------
+text1 <- "Haurŕs de dirigir-te al punt de trobada del grup al que et vulguis unir."
+text2 <- "i want to buy an iphone"
+text3 <- "Je déteste ne plus avoir de dentifrice."
+request <- c(text1, text2, text3)
+
+monkey_classify(request, classifier_id = "cl_oJNMkt2V")
+
+## ---- message = FALSE----------------------------------------------------
+text1 <- "I think this is awesome."
+text2 <- "Holy shit! You did great!"
+request <- c(text1, text2)
+
+monkey_classify(request, classifier_id = "cl_KFXhoTdt")
+
+## ---- message = FALSE----------------------------------------------------
+text1 <- "Let me tell you about my dog and my cat. They are really friendly and like going on walks. They both like chasing mice."
+text2 <- "My first R package was probably a disaster but I keep learning how to program."
+request <- c(text1, text2)
+monkey_classify(request, classifier_id = "cl_5icAVzKR")
+
+
+## ----monkey_input--------------------------------------------------------
+input <- c("Emma Woodhouse, handsome, clever, and rich, with a comfortable home",
+ "and happy disposition, seemed to unite some of the best blessings of",
+ "existence; and had lived nearly twenty-one years in the world with very",
+ "little to distress or vex her.",
+ "", # <--- note the empty string!
+ "She was the youngest of the two daughters of a most affectionate,",
+ "indulgent father; and had, in consequence of her sister's marriage, been",
+ "mistress of his house from a very early period. Her mother had died",
+ "too long ago for her to have more than an indistinct remembrance of",
+ "her caresses; and her place had been supplied by an excellent woman as",
+ "governess, who had fallen little short of a mother in affection.")
+
+## ----monkey_output-------------------------------------------------------
+(output <- monkey_classify(input, unnest = FALSE))
+
+## ----very_empty_input----------------------------------------------------
+(very_empty_input <- rep("", 25) %>% c(input) %>% sample())
+
+## ------------------------------------------------------------------------
+monkey_classify(very_empty_input, unnest = FALSE)
+
+## ------------------------------------------------------------------------
+output$res
+
+## ----unnest_true---------------------------------------------------------
+(output_unnested <- monkey_classify(input, verbose = FALSE, unnest = TRUE))
+
+## ----compare_df----------------------------------------------------------
+input_df <- tibble::tibble(text = input)
+output_df_unnested <- monkey_classify(input_df, text, unnest = TRUE, verbose = FALSE) %>%
+ dplyr::rename(req = text)
+
+testthat::expect_equal(output_unnested, output_df_unnested)
+
+## ----keep_all------------------------------------------------------------
+sw <- dplyr::starwars %>%
+ dplyr::select(name, height) %>%
+ dplyr::sample_n(nrow(input_df))
+
+sw_input_df <- input_df %>%
+ dplyr::bind_cols(sw)
+
+sw_input_df %>% monkey_classify(text, unnest = FALSE, verbose = FALSE)
+
+## ----one_by_one, warning=FALSE-------------------------------------------
+one_by_one <- system.time(output <- monkey_classify(input, texts_per_req = 1))
+
+## ----batch_of_five, warning=FALSE----------------------------------------
+batch_of_five <- system.time(output <- monkey_classify(input, texts_per_req = 5))
+
+## ----speedup-------------------------------------------------------------
+(speedup <- one_by_one[1] / batch_of_five[1])
+
diff --git a/inst/doc/monkeylearn_intro.Rmd b/inst/doc/monkeylearn_intro.Rmd
new file mode 100644
index 0000000..84ebfc3
--- /dev/null
+++ b/inst/doc/monkeylearn_intro.Rmd
@@ -0,0 +1,323 @@
+---
+title: "monkeylearn, a R Package for Natural Language Processing Using Monkeylearn Existing Modules"
+author: "M. Salmon, A. Dobbyn"
+date: "`r Sys.Date()`"
+output: rmarkdown::html_vignette
+vignette: >
+ %\VignetteIndexEntry{intro}
+ %\VignetteEngine{knitr::rmarkdown}
+ %\VignetteEncoding{UTF-8}
+---
+
+```{r, echo = FALSE, warning=FALSE, message=FALSE}
+NOT_CRAN <- identical(tolower(Sys.getenv("NOT_CRAN")), "true")
+pat <- Sys.getenv("MONKEYLEARN_KEY")
+IS_THERE_KEY <- (pat != "")
+NOT_CRAN <- ifelse(IS_THERE_KEY, NOT_CRAN, FALSE)
+knitr::opts_chunk$set(
+ collapse = TRUE,
+ comment = "#>",
+ purl = NOT_CRAN,
+ eval = NOT_CRAN
+)
+```
+
+# Intro
+
+This package is an interface to the [MonkeyLearn API](http://docs.monkeylearn.com/article/api-reference/). MonkeyLearn is a Machine Learning platform on the cloud that allows software companies and developers to easily extract actionable data from text.
+
+The goal of the package is not to support machine learning algorithms development with R or the API, but only to *reap the benefits of the existing modules on Monkeylearn*. Therefore, there are only two functions, one for using *extractors*, and one for using *classifiers*. The difference between extractors and classifiers is that extractors output information about words, whereas classifiers output information about each text as a whole. Named entity recognition is an extraction task, whereas assigning a topic to a text is a classification task.
+
+## Setup
+
+To get an API key for MonkeyLearn, register at http://monkeylearn.com/. Note that MonkeyLearn supports registration through GitHub, which makes the registration process really easy. For ease of use, save your API key as an environment variable as described at http://stat545.com/bit003_api-key-env-var.html. You might also want to use the `usethis::edit_r_environ()` function to modify .Renviron.
+
+All functions of the package will conveniently look for your API key using `Sys.getenv("MONKEYLEARN_KEY")` so if your API key is an environment variable called "MONKEYLEARN\_KEY" you don't need to input it manually.
+
+Please also create a "MONKEYLEARN\_PLAN" environment variable indicating whether your [Monkeylearn plan](https://app.monkeylearn.com/main/my-account/tab/change-plan/) is "free", "team", "business" or "custom". If you do not indicate it by default it will be "free" with a message. If your plan is "custom" you'll need a third environment variable "MONKEYLEARN\_RATE" indicating the maximum amount of requests per minute that you can make to the API. If you do not indicate it, by default it will be 120 with a message.
+
+## So many monkeys/functions
+
+The packages exports `monkeylearn_classify`, `monkey_classify`, `monkeylearn_extract`, `monkey_extract`. The `monkey_` functions are the newer and better ones, so if you don't have legacy code, just start using those!
+
+For inspiration beyond this vignette, you can see external examples of the package in action [on this page](http://ropensci.github.io/monkeylearn/). In particular you'll find examples using the older set of functions but we now recommend using `monkey_extract` and `monkey_classify`, see more later in the vignette.
+
+# Extract
+
+## A first example
+
+```{r, message = FALSE}
+library(monkeylearn)
+library(magrittr)
+
+text <- "In the 19th century, the major European powers had gone to great lengths to maintain a balance of power throughout Europe, resulting in the existence of a complex network of political and military alliances throughout the continent by 1900.[7] These had started in 1815, with the Holy Alliance between Prussia, Russia, and Austria. Then, in October 1873, German Chancellor Otto von Bismarck negotiated the League of the Three Emperors (German: Dreikaiserbund) between the monarchs of Austria-Hungary, Russia and Germany."
+output <- monkey_extract(input = text,
+ extractor_id = "ex_isnnZRbS")
+output
+attr(output, "headers")
+```
+
+## Parameters
+
+If the documentation of the extractor you use states it has parameters, you can pass them as a named list, see below.
+
+```{r}
+text <- "A panel of Goldman Sachs employees spent a recent Tuesday night at the
+Columbia University faculty club trying to convince a packed room of potential
+recruits that Wall Street, not Silicon Valley, was the place to be for computer
+scientists.\n\n The Goldman employees knew they had an uphill battle. They were
+fighting against perceptions of Wall Street as boring and regulation-bound and
+Silicon Valley as the promised land of flip-flops, beanbag chairs and million-dollar
+stock options.\n\n Their argument to the room of technologically inclined students
+was that Wall Street was where they could find far more challenging, diverse and,
+yes, lucrative jobs working on some of the worlds most difficult technical problems."
+
+output <- monkey_extract(text,
+ extractor_id = "ex_y7BPYzNG",
+ params = list(max_keywords = 3))
+output
+output2 <- monkey_extract(text,
+ extractor_id = "ex_y7BPYzNG",
+ params = list(max_keywords = 1))
+output2
+attr(output2, "headers")
+```
+
+## How to find extractors?
+
+You can find extractors and their IDs, including extractors for text in Spanish, at https://app.monkeylearn.com/main/explore
+
+There is no endpoint for automatically finding all extractors, but if you find one in the website you particularly like and use a lot in your language and application, you could choose to save its id as an environment variable as explained [here]( http://stat545.com/bit003_api-key-env-var.html). Reading about extractors on the website will give you a good overview of their characteristics and original application.
+
+Here are a few ones for text in English:
+
+* [Entity extractor](https://app.monkeylearn.com/extraction/extractors/ex_isnnZRbS/tab/description-tab), `extractor_id = "ex_isnnZRbS"` (used in the first example). Extract Entities from text using Named Entity Recognition (NER). NER labels sequences of words in a text which are the names of things, such as person and company names. This implementation labels 3 classes: PERSON, ORGANIZATION and LOCATION. This NER tagger is implemented using Conditional Random Field (CRF) sequence models.
+
+* [Keyword extractor](https://app.monkeylearn.com/extraction/extractors/ex_y7BPYzNG/tab/description-tab), `extractor_id = "ex_y7BPYzNG"`. Extract keywords from text in English. Keywords can be compounded by one or more words and are defined as the important topics in your content and can be used to index data, generate tag clouds or for searching. This keyword extraction algorithm employs statistical algorithms and natural language processing technology to analyze your content and identify the relevant keywords.
+
+```{r, message = FALSE}
+text <- "A panel of Goldman Sachs employees spent a recent Tuesday night at the Columbia University faculty club trying to convince a packed room of potential recruits that Wall Street, not Silicon Valley, was the place to be for computer scientists.
+
+The Goldman employees knew they had an uphill battle. They were fighting against perceptions of Wall Street as boring and regulation-bound and Silicon Valley as the promised land of flip-flops, beanbag chairs and million-dollar stock options.
+
+Their argument to the room of technologically inclined students was that Wall Street was where they could find far more challenging, diverse and, yes, lucrative jobs working on some of the world’s most difficult technical problems.
+
+“Whereas in other opportunities you might be considering, it is working one type of data or one type of application, we deal in hundreds of products in hundreds of markets, with thousands or tens of thousands of clients, every day, millions of times of day worldwide,” Afsheen Afshar, a managing director at Goldman Sachs, told the students."
+
+monkey_extract(text, extractor_id = "ex_y7BPYzNG")
+```
+
+* [Useful data extractor](https://app.monkeylearn.com/extraction/extractors/ex_dqRio5sG/tab/description-tab), `extractor_id = "ex_dqRio5sG"`. Extract useful data from text. This algorithm can be used to detect many different useful data: links, phones, ips, prices, times, emails, bitcoin addresses, dates, ipv6s, hex colors and credit cards.
+
+When using this extractor, the format of the API output is a bit different than for other extractors, see below how the output looks like.
+
+```{r, message = FALSE}
+text <- "Hi, my email is john@example.com and my credit card is 4242-4242-4242-4242 so you can charge me with $10. My phone number is 15555 9876. We can get in touch on April 16, at 10:00am"
+text2 <- "Hi, my email is mary@example.com and my credit card is 4242-4232-4242-4242. My phone number is 16655 9876. We can get in touch on April 16, at 10:00am"
+
+monkey_extract(c(text, text2), extractor_id = "ex_dqRio5sG", unnest = TRUE)
+```
+
+
+# Classify
+
+## A first example
+
+```{r, message = FALSE}
+text1 <- "my dog is an avid rice eater"
+text2 <- "i want to buy an iphone"
+request <- c(text1, text2)
+
+monkey_classify(request, classifier_id = "cl_oFKL5wft")
+```
+## How to find classifiers?
+
+You can find classifiers and their IDs at https://app.monkeylearn.com/main/explore or you can use the `monkeylearn_classifiers` function, choosing to show all classifiers or only the private ones with `private = TRUE`. The first column of the resulting data.frame is the `classifier_id` to be used in `monkeylearn_classify`.
+
+```{r}
+monkeylearn_classifiers(private = FALSE)
+```
+
+Here are a few other examples:
+
+* [Language detection](https://app.monkeylearn.com/categorizer/projects/cl_oJNMkt2V/tab/main-tab), `classifier_id = "cl_oJNMkt2V"`. Detect language in text. New languages were added for a total of 48 different languages arranged in language families.
+
+```{r, message = FALSE}
+text1 <- "HaurĂ s de dirigir-te al punt de trobada del grup al que et vulguis unir."
+text2 <- "i want to buy an iphone"
+text3 <- "Je déteste ne plus avoir de dentifrice."
+request <- c(text1, text2, text3)
+
+monkey_classify(request, classifier_id = "cl_oJNMkt2V")
+```
+
+* [Profanity and abuse detection](https://app.monkeylearn.com/categorizer/projects/cl_KFXhoTdt/tab/main-tab), `classifier_id = "cl_KFXhoTdt"`.
+
+```{r, message = FALSE}
+text1 <- "I think this is awesome."
+text2 <- "Holy shit! You did great!"
+request <- c(text1, text2)
+
+monkey_classify(request, classifier_id = "cl_KFXhoTdt")
+```
+
+* [General topic classifier](https://app.monkeylearn.com/categorizer/projects/cl_5icAVzKR/tab/), `classifier_id = "cl_5icAVzKR"`.
+
+```{r, message = FALSE}
+text1 <- "Let me tell you about my dog and my cat. They are really friendly and like going on walks. They both like chasing mice."
+text2 <- "My first R package was probably a disaster but I keep learning how to program."
+request <- c(text1, text2)
+monkey_classify(request, classifier_id = "cl_5icAVzKR")
+
+```
+
+
+# Get what you paid for
+
+Monkeylearn offers a different service based on your current plan, that is, "free", "team" or "business". These plans will both influence your _rate limiting_ (how fast?) and your _query limiting_ (how many queries?). See https://monkeylearn.com/pricing/. Thanks to your MONKEYLEARN_PLAN environment variable, the rate will be handled automatically thanks to [`ratelimitr`](https://github.com/tarakc02/ratelimitr).
+
+## Check the number of remaining calls
+
+After each call to a function you can check how many calls to the API you can still make using `attr(output, "headers")$x.query.limit.remaining` and `attr(output, "headers")$x.query.limit.limit`. The period after which `attr(output, "headers")$x.query.limit.remaining` depends on your subscription and is not included in the output.
+
+
+
+# Fit `monkeylearn` into your pipeline!
+
+You can:
+
+* Send a vector of texts *or* a dataframe and a named column (unquoted)
+* Output either a nested or unnested dataframe
+ * Nested = 1 row per input; unnested = 1 row per output
+* This output
+ * Relates each input text to its (usually) multiple classifications/extractions
+ * Retains a record of inputs that could not be classified/extracted (e.g., empty strings)
+* Batch requests
+
+
+## In a bit more detail
+
+You can classify or extract a vector or dataframe of texts while relating the original input text to its classifications. This is important, because the input:output relationship may not always (and in fact, is not usually) 1:1. These functions retain the tie between each `input`[^1] element and all of its output elements.
+
+```{r monkey_input}
+input <- c("Emma Woodhouse, handsome, clever, and rich, with a comfortable home",
+ "and happy disposition, seemed to unite some of the best blessings of",
+ "existence; and had lived nearly twenty-one years in the world with very",
+ "little to distress or vex her.",
+ "", # <--- note the empty string!
+ "She was the youngest of the two daughters of a most affectionate,",
+ "indulgent father; and had, in consequence of her sister's marriage, been",
+ "mistress of his house from a very early period. Her mother had died",
+ "too long ago for her to have more than an indistinct remembrance of",
+ "her caresses; and her place had been supplied by an excellent woman as",
+ "governess, who had fallen little short of a mother in affection.")
+```
+
+That is true even if you have inputs that cannot be processed. For instance, empty string and `NA` input elements are not sent to the API for classification/extraction. (You'll get a warning of this if `verbose = TRUE`.) We've got one above to illustrate and elements that returned no classifications/extractions are included in the resulting dataframe. This way you'll know which inputs could not be processed.
+
+```{r monkey_output}
+(output <- monkey_classify(input, unnest = FALSE))
+```
+
+
+
+If there are more than 20 empty inputs, we save your console by messaging only the first 20 indices.
+
+```{r very_empty_input}
+(very_empty_input <- rep("", 25) %>% c(input) %>% sample())
+```
+
+
+Since the entire original input is represented in the output, if you need to find all of the empty inputs you can easily filter the output to all of the rows containing empty strings.
+```{r}
+monkey_classify(very_empty_input, unnest = FALSE)
+```
+
+
+### Configuring the Output
+
+The default output is a nested dataframe with the same number of rows as your input dataframe or the same length as your input vector, depending on which one you sent in.
+
+Let's take a look at the `res` output column.
+```{r}
+output$res
+```
+
+You can easily choose an unnested output by setting the **unnest flag** to TRUE (which it is by default) to get one row per classification/extraction.
+
+```{r unnest_true}
+(output_unnested <- monkey_classify(input, verbose = FALSE, unnest = TRUE))
+```
+
+We could have gotten the same result by sending in a dataframe and a named column. If a dataframe is supplied input column is not renamed to `req` as it is when input is a vector; the original column name is retained.
+
+```{r compare_df}
+input_df <- tibble::tibble(text = input)
+output_df_unnested <- monkey_classify(input_df, text, unnest = TRUE, verbose = FALSE) %>%
+ dplyr::rename(req = text)
+
+testthat::expect_equal(output_unnested, output_df_unnested)
+```
+
+
+
+If the input is a dataframe, setting the `.keep_all` option to TRUE allows you to retain all input columns. If FALSE, only the column you specify for classification will be retained.
+
+```{r keep_all}
+sw <- dplyr::starwars %>%
+ dplyr::select(name, height) %>%
+ dplyr::sample_n(nrow(input_df))
+
+sw_input_df <- input_df %>%
+ dplyr::bind_cols(sw)
+
+sw_input_df %>% monkey_classify(text, unnest = FALSE, verbose = FALSE)
+```
+
+
+### Batching
+
+Retaining the relationship between input and output doesn't mean you'll need to send requests one-by-one. **Batch requests** by setting the `texts_per_req` value which governs the number of texts that are sent per request. Per the [MonkeyLearn documentation](http://help.monkeylearn.com/frequently-asked-questions/queries/can-i-classify-or-extract-more-than-one-text-with-one-api-request), the maximum we recommend sending at once is 200 requests.
+
+If `texts_per_req` is NULL, the default, we try to optimize the response time from the API by setting `texts_per_req` to 200 when your input has more than 200 texts or to the length of the `input` if you've got fewer. You'll see a significant speedup by batching your requests this way. However, batching doesn't save you on queries; a batch of 150 texts still uses up 150 queries.
+
+These functions also include some more verbose **progress reporting**, letting you know what batch you're on out of the total, and which texts are set to be processed in that batch.
+
+```{r one_by_one, warning=FALSE}
+one_by_one <- system.time(output <- monkey_classify(input, texts_per_req = 1))
+```
+
+```{r batch_of_five, warning=FALSE}
+batch_of_five <- system.time(output <- monkey_classify(input, texts_per_req = 5))
+```
+
+How much does sending 5 texts in a batch vs. 1 text improve our processing time?
+```{r speedup}
+(speedup <- one_by_one[1] / batch_of_five[1])
+```
+
+
+A 3-4x speedup isn't so bad! Worth keeping in mind that if you need the blazing fast speeds you might consider upgrading to a higher MonkeyLearn price tier.
+
+
+
+
+
+
+
+***
+
+
+
+
+# Meta
+
+* Please [report any issues or bugs](https://github.com/ropensci/monkeylearn/issues).
+* License: GPL
+* Get citation information for `monkeylearn` in R doing `citation(package = 'monkeylearn')`
+* Please note that this project is released with a [Contributor Code of Conduct](CONDUCT.md). By participating in this project you agree to abide by its terms.
+* This package is part of the [rOpenSci project](https://ropensci.org/).
+
+
+[^1]: Thanks to [Julia Silge](https://juliasilge.com/)'s fantastic [`janeaustenr`](https://github.com/juliasilge/janeaustenr) package for this text!
+
diff --git a/inst/doc/monkeylearn_intro.html b/inst/doc/monkeylearn_intro.html
new file mode 100644
index 0000000..0bc3366
--- /dev/null
+++ b/inst/doc/monkeylearn_intro.html
@@ -0,0 +1,833 @@
+
+
+
+
+
This package is an interface to the MonkeyLearn API. MonkeyLearn is a Machine Learning platform on the cloud that allows software companies and developers to easily extract actionable data from text.
+The goal of the package is not to support machine learning algorithms development with R or the API, but only to reap the benefits of the existing modules on Monkeylearn. Therefore, there are only two functions, one for using extractors, and one for using classifiers. The difference between extractors and classifiers is that extractors output information about words, whereas classifiers output information about each text as a whole. Named entity recognition is an extraction task, whereas assigning a topic to a text is a classification task.
+To get an API key for MonkeyLearn, register at http://monkeylearn.com/. Note that MonkeyLearn supports registration through GitHub, which makes the registration process really easy. For ease of use, save your API key as an environment variable as described at http://stat545.com/bit003_api-key-env-var.html. You might also want to use the usethis::edit_r_environ()
function to modify .Renviron.
All functions of the package will conveniently look for your API key using Sys.getenv("MONKEYLEARN_KEY")
so if your API key is an environment variable called “MONKEYLEARN_KEY” you don’t need to input it manually.
Please also create a “MONKEYLEARN_PLAN” environment variable indicating whether your Monkeylearn plan is “free”, “team”, “business” or “custom”. If you do not indicate it by default it will be “free” with a message. If your plan is “custom” you’ll need a third environment variable “MONKEYLEARN_RATE” indicating the maximum amount of requests per minute that you can make to the API. If you do not indicate it, by default it will be 120 with a message.
+The packages exports monkeylearn_classify
, monkey_classify
, monkeylearn_extract
, monkey_extract
. The monkey_
functions are the newer and better ones, so if you don’t have legacy code, just start using those!
For inspiration beyond this vignette, you can see external examples of the package in action on this page. In particular you’ll find examples using the older set of functions but we now recommend using monkey_extract
and monkey_classify
, see more later in the vignette.
library(monkeylearn)
+library(magrittr)
+
+text <- "In the 19th century, the major European powers had gone to great lengths to maintain a balance of power throughout Europe, resulting in the existence of a complex network of political and military alliances throughout the continent by 1900.[7] These had started in 1815, with the Holy Alliance between Prussia, Russia, and Austria. Then, in October 1873, German Chancellor Otto von Bismarck negotiated the League of the Three Emperors (German: Dreikaiserbund) between the monarchs of Austria-Hungary, Russia and Germany."
+output <- monkey_extract(input = text,
+ extractor_id = "ex_isnnZRbS")
+output
+#> # A tibble: 7 x 4
+#> req count tag entity
+#> <chr> <int> <chr> <chr>
+#> 1 In the 19th century, the major European powers had ~ 1 LOCAT~ Europe
+#> 2 In the 19th century, the major European powers had ~ 1 LOCAT~ Pruss~
+#> 3 In the 19th century, the major European powers had ~ 1 LOCAT~ Austr~
+#> 4 In the 19th century, the major European powers had ~ 1 LOCAT~ Austr~
+#> 5 In the 19th century, the major European powers had ~ 1 LOCAT~ Germa~
+#> 6 In the 19th century, the major European powers had ~ 1 PERSON Otto ~
+#> 7 In the 19th century, the major European powers had ~ 2 LOCAT~ Russia
+attr(output, "headers")
+#> # A tibble: 1 x 11
+#> server date content.type transfer.encodi~ connection x.query.limit.l~
+#> <chr> <chr> <chr> <chr> <chr> <chr>
+#> 1 nginx/~ Wed, ~ application~ chunked keep-alive 5000000
+#> # ... with 5 more variables: x.query.limit.remaining <chr>,
+#> # x.query.limit.request.queries <chr>, allow <chr>,
+#> # content.encoding <chr>, text_md5 <chr>
If the documentation of the extractor you use states it has parameters, you can pass them as a named list, see below.
+text <- "A panel of Goldman Sachs employees spent a recent Tuesday night at the
+Columbia University faculty club trying to convince a packed room of potential
+recruits that Wall Street, not Silicon Valley, was the place to be for computer
+scientists.\n\n The Goldman employees knew they had an uphill battle. They were
+fighting against perceptions of Wall Street as boring and regulation-bound and
+Silicon Valley as the promised land of flip-flops, beanbag chairs and million-dollar
+stock options.\n\n Their argument to the room of technologically inclined students
+was that Wall Street was where they could find far more challenging, diverse and,
+yes, lucrative jobs working on some of the worlds most difficult technical problems."
+
+output <- monkey_extract(text,
+ extractor_id = "ex_y7BPYzNG",
+ params = list(max_keywords = 3))
+#> Processing batch 1 of 1 batches: texts 1 to 1
+output
+#> # A tibble: 3 x 5
+#> req count relevance positions_in_te~ keyword
+#> <chr> <int> <chr> <list> <chr>
+#> 1 "A panel of Goldman Sachs empl~ 3 0.978 <int [3]> Wall S~
+#> 2 "A panel of Goldman Sachs empl~ 2 0.652 <int [2]> Silico~
+#> 3 "A panel of Goldman Sachs empl~ 1 0.543 <int [1]> millio~
+output2 <- monkey_extract(text,
+ extractor_id = "ex_y7BPYzNG",
+ params = list(max_keywords = 1))
+#> Processing batch 1 of 1 batches: texts 1 to 1
+output2
+#> # A tibble: 1 x 5
+#> req count relevance positions_in_te~ keyword
+#> <chr> <int> <chr> <list> <chr>
+#> 1 "A panel of Goldman Sachs empl~ 3 0.978 <int [3]> Wall S~
+attr(output2, "headers")
+#> # A tibble: 1 x 11
+#> server date content.type transfer.encodi~ connection x.query.limit.l~
+#> <chr> <chr> <chr> <chr> <chr> <chr>
+#> 1 nginx/~ Wed, ~ application~ chunked keep-alive 5000000
+#> # ... with 5 more variables: x.query.limit.remaining <chr>,
+#> # x.query.limit.request.queries <chr>, allow <chr>,
+#> # content.encoding <chr>, text_md5 <chr>
You can find extractors and their IDs, including extractors for text in Spanish, at https://app.monkeylearn.com/main/explore
+There is no endpoint for automatically finding all extractors, but if you find one in the website you particularly like and use a lot in your language and application, you could choose to save its id as an environment variable as explained here. Reading about extractors on the website will give you a good overview of their characteristics and original application.
+Here are a few ones for text in English:
+Entity extractor, extractor_id = "ex_isnnZRbS"
(used in the first example). Extract Entities from text using Named Entity Recognition (NER). NER labels sequences of words in a text which are the names of things, such as person and company names. This implementation labels 3 classes: PERSON, ORGANIZATION and LOCATION. This NER tagger is implemented using Conditional Random Field (CRF) sequence models.
Keyword extractor, extractor_id = "ex_y7BPYzNG"
. Extract keywords from text in English. Keywords can be compounded by one or more words and are defined as the important topics in your content and can be used to index data, generate tag clouds or for searching. This keyword extraction algorithm employs statistical algorithms and natural language processing technology to analyze your content and identify the relevant keywords.
text <- "A panel of Goldman Sachs employees spent a recent Tuesday night at the Columbia University faculty club trying to convince a packed room of potential recruits that Wall Street, not Silicon Valley, was the place to be for computer scientists.
+
+The Goldman employees knew they had an uphill battle. They were fighting against perceptions of Wall Street as boring and regulation-bound and Silicon Valley as the promised land of flip-flops, beanbag chairs and million-dollar stock options.
+
+Their argument to the room of technologically inclined students was that Wall Street was where they could find far more challenging, diverse and, yes, lucrative jobs working on some of the world’s most difficult technical problems.
+
+“Whereas in other opportunities you might be considering, it is working one type of data or one type of application, we deal in hundreds of products in hundreds of markets, with thousands or tens of thousands of clients, every day, millions of times of day worldwide,” Afsheen Afshar, a managing director at Goldman Sachs, told the students."
+
+monkey_extract(text, extractor_id = "ex_y7BPYzNG")
+#> # A tibble: 10 x 5
+#> req count relevance positions_in_te~ keyword
+#> <chr> <int> <chr> <list> <chr>
+#> 1 "A panel of Goldman Sachs emp~ 3 0.978 <int [3]> Wall S~
+#> 2 "A panel of Goldman Sachs emp~ 2 0.652 <int [2]> Silico~
+#> 3 "A panel of Goldman Sachs emp~ 1 0.543 <int [1]> millio~
+#> 4 "A panel of Goldman Sachs emp~ 1 0.543 <int [1]> Goldma~
+#> 5 "A panel of Goldman Sachs emp~ 1 0.543 <int [1]> Univer~
+#> 6 "A panel of Goldman Sachs emp~ 1 0.543 <int [1]> recent~
+#> 7 "A panel of Goldman Sachs emp~ 1 0.543 <int [1]> diffic~
+#> 8 "A panel of Goldman Sachs emp~ 2 0.435 <int [2]> thousa~
+#> 9 "A panel of Goldman Sachs emp~ 2 0.435 <int [2]> type
+#> 10 "A panel of Goldman Sachs emp~ 2 0.435 <int [2]> hundre~
extractor_id = "ex_dqRio5sG"
. Extract useful data from text. This algorithm can be used to detect many different useful data: links, phones, ips, prices, times, emails, bitcoin addresses, dates, ipv6s, hex colors and credit cards.When using this extractor, the format of the API output is a bit different than for other extractors, see below how the output looks like.
+text <- "Hi, my email is john@example.com and my credit card is 4242-4242-4242-4242 so you can charge me with $10. My phone number is 15555 9876. We can get in touch on April 16, at 10:00am"
+text2 <- "Hi, my email is mary@example.com and my credit card is 4242-4232-4242-4242. My phone number is 16655 9876. We can get in touch on April 16, at 10:00am"
+
+monkey_extract(c(text, text2), extractor_id = "ex_dqRio5sG", unnest = TRUE)
+#> # A tibble: 2 x 12
+#> req dates links phones ipv6s hex_colors ips credit_cards prices
+#> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr>
+#> 1 Hi, my em~ Apri~ exam~ 15555~ <NA> <NA> <NA> 4242-4242-4~ $10
+#> 2 Hi, my em~ Apri~ exam~ 16655~ <NA> <NA> <NA> 4242-4232-4~ <NA>
+#> # ... with 3 more variables: times <chr>, emails <chr>,
+#> # bitcoin_addresses <chr>
text1 <- "my dog is an avid rice eater"
+text2 <- "i want to buy an iphone"
+request <- c(text1, text2)
+
+monkey_classify(request, classifier_id = "cl_oFKL5wft")
+#> # A tibble: 6 x 4
+#> req category_id probability label
+#> <chr> <int> <dbl> <chr>
+#> 1 my dog is an avid rice eater 18313097 0.130 Pets
+#> 2 my dog is an avid rice eater 18313108 0.239 Dogs
+#> 3 my dog is an avid rice eater 18313113 0.0820 Dog Food
+#> 4 i want to buy an iphone 18314739 0.113 Cell Phones
+#> 5 i want to buy an iphone 18314740 0.186 Accessories
+#> 6 i want to buy an iphone 18314741 0.0940 Cases & Protectors
You can find classifiers and their IDs at https://app.monkeylearn.com/main/explore or you can use the monkeylearn_classifiers
function, choosing to show all classifiers or only the private ones with private = TRUE
. The first column of the resulting data.frame is the classifier_id
to be used in monkeylearn_classify
.
monkeylearn_classifiers(private = FALSE)
+#> # A tibble: 45 x 19
+#> classifier_id name description train_state train_job_id language
+#> <chr> <chr> <chr> <chr> <lgl> <chr>
+#> 1 cl_Aiu8dfYF Custo~ "Classifies cus~ TRAINED NA en
+#> 2 cl_gBhyCMCs Custo~ Detects if huma~ TRAINED NA en
+#> 3 cl_N3VzxNLN Custo~ Differentiates ~ TRAINED NA en
+#> 4 cl_csyzaevo Role ~ "Classifies job~ DIRTY NA en
+#> 5 cl_pX4g5EEF Custo~ Helps to detect~ TRAINED NA en
+#> 6 cl_Jx8qzYJh Senti~ Detect sentimen~ TRAINED NA en
+#> 7 cl_sGdE8hD9 NPS S~ Classify NPS re~ DIRTY NA en
+#> 8 cl_T7XMb74S IAB T~ Classifies text~ TRAINED NA en
+#> 9 cl_nLW3yR6m Telco~ Classify custom~ TRAINED NA en
+#> 10 cl_4LqLD7cN Telco~ Detect the topi~ TRAINED NA en
+#> # ... with 35 more rows, and 13 more variables: ngram_range <chr>,
+#> # use_stemmer <lgl>, stop_words <chr>, max_features <int>,
+#> # strip_stopwords <lgl>, is_multilabel <lgl>, is_twitter_data <lgl>,
+#> # normalize_weights <lgl>, classifier <chr>, industry <chr>,
+#> # classifier_type <chr>, text_type <chr>, permissions <chr>
Here are a few other examples:
+classifier_id = "cl_oJNMkt2V"
. Detect language in text. New languages were added for a total of 48 different languages arranged in language families.text1 <- "HaurĂ s de dirigir-te al punt de trobada del grup al que et vulguis unir."
+text2 <- "i want to buy an iphone"
+text3 <- "Je déteste ne plus avoir de dentifrice."
+request <- c(text1, text2, text3)
+
+monkey_classify(request, classifier_id = "cl_oJNMkt2V")
+#> # A tibble: 6 x 4
+#> req category_id probability label
+#> <chr> <int> <dbl> <chr>
+#> 1 HaurĂ s de dirigir-te al punt de trobada~ 53876647 1.00 Italic
+#> 2 HaurĂ s de dirigir-te al punt de trobada~ 53876648 0.642 Catala~
+#> 3 i want to buy an iphone 53876636 0.384 German~
+#> 4 i want to buy an iphone 53876639 0.613 Englis~
+#> 5 Je déteste ne plus avoir de dentifrice. 53876647 0.995 Italic
+#> 6 Je déteste ne plus avoir de dentifrice. 53876649 0.887 French~
classifier_id = "cl_KFXhoTdt"
.text1 <- "I think this is awesome."
+text2 <- "Holy shit! You did great!"
+request <- c(text1, text2)
+
+monkey_classify(request, classifier_id = "cl_KFXhoTdt")
+#> # A tibble: 2 x 4
+#> req category_id probability label
+#> <chr> <int> <dbl> <chr>
+#> 1 I think this is awesome. 22375077 0.803 clean
+#> 2 Holy shit! You did great! 22375076 0.997 profanity
classifier_id = "cl_5icAVzKR"
.text1 <- "Let me tell you about my dog and my cat. They are really friendly and like going on walks. They both like chasing mice."
+text2 <- "My first R package was probably a disaster but I keep learning how to program."
+request <- c(text1, text2)
+monkey_classify(request, classifier_id = "cl_5icAVzKR")
+#> # A tibble: 5 x 4
+#> req category_id probability label
+#> <chr> <int> <dbl> <chr>
+#> 1 Let me tell you about my dog and my c~ 64600 0.894 Animals
+#> 2 Let me tell you about my dog and my c~ 64608 0.649 Mammals
+#> 3 Let me tell you about my dog and my c~ 64611 0.869 Land Mam~
+#> 4 My first R package was probably a dis~ 64638 0.240 Computer~
+#> 5 My first R package was probably a dis~ 64640 0.252 Internet
Monkeylearn offers a different service based on your current plan, that is, “free”, “team” or “business”. These plans will both influence your rate limiting (how fast?) and your query limiting (how many queries?). See https://monkeylearn.com/pricing/. Thanks to your MONKEYLEARN_PLAN environment variable, the rate will be handled automatically thanks to ratelimitr
.
After each call to a function you can check how many calls to the API you can still make using attr(output, "headers")$x.query.limit.remaining
and attr(output, "headers")$x.query.limit.limit
. The period after which attr(output, "headers")$x.query.limit.remaining
depends on your subscription and is not included in the output.
monkeylearn
into your pipeline!You can:
+You can classify or extract a vector or dataframe of texts while relating the original input text to its classifications. This is important, because the input:output relationship may not always (and in fact, is not usually) 1:1. These functions retain the tie between each input
1 element and all of its output elements.
input <- c("Emma Woodhouse, handsome, clever, and rich, with a comfortable home",
+ "and happy disposition, seemed to unite some of the best blessings of",
+ "existence; and had lived nearly twenty-one years in the world with very",
+ "little to distress or vex her.",
+ "", # <--- note the empty string!
+ "She was the youngest of the two daughters of a most affectionate,",
+ "indulgent father; and had, in consequence of her sister's marriage, been",
+ "mistress of his house from a very early period. Her mother had died",
+ "too long ago for her to have more than an indistinct remembrance of",
+ "her caresses; and her place had been supplied by an excellent woman as",
+ "governess, who had fallen little short of a mother in affection.")
That is true even if you have inputs that cannot be processed. For instance, empty string and NA
input elements are not sent to the API for classification/extraction. (You’ll get a warning of this if verbose = TRUE
.) We’ve got one above to illustrate and elements that returned no classifications/extractions are included in the resulting dataframe. This way you’ll know which inputs could not be processed.
(output <- monkey_classify(input, unnest = FALSE))
+#> Using classifier ID cl_oFKL5wft; to find other classifiers, run monkeylearn_classifiers() or visit https://app.monkeylearn.com/main/explore/
+#> The following indices were empty strings and could not be sent to the API: 5. They will still be included in the output.
+#> Processing batch 1 of 1 batches: texts 1 to 10
+#> # A tibble: 11 x 2
+#> req res
+#> <chr> <list>
+#> 1 Emma Woodhouse, handsome, clever, and rich, with a com~ <data.frame [4~
+#> 2 and happy disposition, seemed to unite some of the bes~ <data.frame [3~
+#> 3 existence; and had lived nearly twenty-one years in th~ <data.frame [3~
+#> 4 little to distress or vex her. <data.frame [4~
+#> 5 "" <data.frame [1~
+#> 6 She was the youngest of the two daughters of a most af~ <data.frame [4~
+#> 7 indulgent father; and had, in consequence of her siste~ <data.frame [3~
+#> 8 mistress of his house from a very early period. Her mo~ <data.frame [3~
+#> 9 too long ago for her to have more than an indistinct r~ <data.frame [4~
+#> 10 her caresses; and her place had been supplied by an ex~ <data.frame [3~
+#> 11 governess, who had fallen little short of a mother in ~ <data.frame [4~
If there are more than 20 empty inputs, we save your console by messaging only the first 20 indices.
+(very_empty_input <- rep("", 25) %>% c(input) %>% sample())
+#> [1] ""
+#> [2] ""
+#> [3] ""
+#> [4] ""
+#> [5] "little to distress or vex her."
+#> [6] ""
+#> [7] "existence; and had lived nearly twenty-one years in the world with very"
+#> [8] ""
+#> [9] ""
+#> [10] ""
+#> [11] "too long ago for her to have more than an indistinct remembrance of"
+#> [12] "governess, who had fallen little short of a mother in affection."
+#> [13] ""
+#> [14] "and happy disposition, seemed to unite some of the best blessings of"
+#> [15] "She was the youngest of the two daughters of a most affectionate,"
+#> [16] ""
+#> [17] ""
+#> [18] "mistress of his house from a very early period. Her mother had died"
+#> [19] ""
+#> [20] ""
+#> [21] ""
+#> [22] "indulgent father; and had, in consequence of her sister's marriage, been"
+#> [23] ""
+#> [24] ""
+#> [25] ""
+#> [26] ""
+#> [27] ""
+#> [28] ""
+#> [29] "her caresses; and her place had been supplied by an excellent woman as"
+#> [30] ""
+#> [31] ""
+#> [32] ""
+#> [33] ""
+#> [34] "Emma Woodhouse, handsome, clever, and rich, with a comfortable home"
+#> [35] ""
+#> [36] ""
Since the entire original input is represented in the output, if you need to find all of the empty inputs you can easily filter the output to all of the rows containing empty strings.
+monkey_classify(very_empty_input, unnest = FALSE)
+#> Using classifier ID cl_oFKL5wft; to find other classifiers, run monkeylearn_classifiers() or visit https://app.monkeylearn.com/main/explore/
+#> The following indices were empty strings and could not be sent to the API. (Displaying first 20): 1, 2, 3, 4, 6, 8, 9, 10, 13, 16, 17, 19, 20, 21, 23, 24, 25, 26, 27, 28... They will still be included in the output.
+#> Processing batch 1 of 1 batches: texts 1 to 10
+#> # A tibble: 36 x 2
+#> req res
+#> <chr> <list>
+#> 1 "" <data.frame [1 ~
+#> 2 "" <data.frame [1 ~
+#> 3 "" <data.frame [1 ~
+#> 4 "" <data.frame [1 ~
+#> 5 little to distress or vex her. <data.frame [4 ~
+#> 6 "" <data.frame [1 ~
+#> 7 existence; and had lived nearly twenty-one years in t~ <data.frame [3 ~
+#> 8 "" <data.frame [1 ~
+#> 9 "" <data.frame [1 ~
+#> 10 "" <data.frame [1 ~
+#> # ... with 26 more rows
The default output is a nested dataframe with the same number of rows as your input dataframe or the same length as your input vector, depending on which one you sent in.
+Let’s take a look at the res
output column.
output$res
+#> [[1]]
+#> category_id probability label
+#> 1 18313280 0.058 Music
+#> 2 18313356 0.052 Pop
+#> 3 18313357 0.338 Pop Rock
+#> 4 18313358 0.209 Joni Mitchell
+#>
+#> [[2]]
+#> category_id probability label
+#> 1 18313280 0.088 Music
+#> 2 18313701 0.051 Special Interest
+#> 3 18313702 0.816 See all Special Interest
+#>
+#> [[3]]
+#> category_id probability label
+#> 1 18314767 0.076 Books
+#> 2 18315016 0.049 Religion
+#> 3 18315024 0.139 Inspirational
+#>
+#> [[4]]
+#> category_id probability label
+#> 1 18313280 0.060 Music
+#> 2 18313601 0.057 Blues
+#> 3 18313612 0.262 Chicago Blues
+#> 4 18313614 0.227 Little Milton
+#>
+#> [[5]]
+#> category_id probability label
+#> 1 NA NA NA
+#>
+#> [[6]]
+#> category_id probability label
+#> 1 18313280 0.059 Music
+#> 2 18313459 0.058 World
+#> 3 18313469 0.352 Celtic
+#> 4 18313471 0.440 Celtic Man
+#>
+#> [[7]]
+#> category_id probability label
+#> 1 18314767 0.075 Books
+#> 2 18314983 0.057 Parenting & Families
+#> 3 18314999 0.057 Marriage
+#>
+#> [[8]]
+#> category_id probability label
+#> 1 18314767 0.078 Books
+#> 2 18314769 0.046 Children's Books
+#> 3 18314788 0.044 Educational
+#>
+#> [[9]]
+#> category_id probability label
+#> 1 18313280 0.061 Music
+#> 2 18313584 0.063 Children's Music
+#> 3 18313589 0.383 Children's Sing-a-Long
+#> 4 18313590 0.285 Various Artists-Sing-A-Longs
+#>
+#> [[10]]
+#> category_id probability label
+#> 1 18314767 0.058 Books
+#> 2 18314839 0.045 Biography & Memoirs
+#> 3 18314851 0.082 Religious
+#>
+#> [[11]]
+#> category_id probability label
+#> 1 18314767 0.068 Books
+#> 2 18314769 0.049 Children's Books
+#> 3 18314784 0.042 Animals
+#> 4 18314785 0.504 Rabbits
You can easily choose an unnested output by setting the unnest flag to TRUE (which it is by default) to get one row per classification/extraction.
+(output_unnested <- monkey_classify(input, verbose = FALSE, unnest = TRUE))
+#> # A tibble: 36 x 4
+#> req category_id probability label
+#> <chr> <int> <dbl> <chr>
+#> 1 Emma Woodhouse, handsome, clever~ 18313280 0.0580 Music
+#> 2 Emma Woodhouse, handsome, clever~ 18313356 0.0520 Pop
+#> 3 Emma Woodhouse, handsome, clever~ 18313357 0.338 Pop Rock
+#> 4 Emma Woodhouse, handsome, clever~ 18313358 0.209 Joni Mitchell
+#> 5 and happy disposition, seemed to~ 18313280 0.0880 Music
+#> 6 and happy disposition, seemed to~ 18313701 0.0510 Special Inte~
+#> 7 and happy disposition, seemed to~ 18313702 0.816 See all Spec~
+#> 8 existence; and had lived nearly ~ 18314767 0.0760 Books
+#> 9 existence; and had lived nearly ~ 18315016 0.0490 Religion
+#> 10 existence; and had lived nearly ~ 18315024 0.139 Inspirational
+#> # ... with 26 more rows
We could have gotten the same result by sending in a dataframe and a named column. If a dataframe is supplied input column is not renamed to req
as it is when input is a vector; the original column name is retained.
input_df <- tibble::tibble(text = input)
+output_df_unnested <- monkey_classify(input_df, text, unnest = TRUE, verbose = FALSE) %>%
+ dplyr::rename(req = text)
+
+testthat::expect_equal(output_unnested, output_df_unnested)
If the input is a dataframe, setting the .keep_all
option to TRUE allows you to retain all input columns. If FALSE, only the column you specify for classification will be retained.
sw <- dplyr::starwars %>%
+ dplyr::select(name, height) %>%
+ dplyr::sample_n(nrow(input_df))
+
+sw_input_df <- input_df %>%
+ dplyr::bind_cols(sw)
+
+sw_input_df %>% monkey_classify(text, unnest = FALSE, verbose = FALSE)
+#> # A tibble: 11 x 4
+#> name height text res
+#> <chr> <int> <chr> <list>
+#> 1 Jek Tono Porkins 180 Emma Woodhouse, handsome, clever,~ <data.fram~
+#> 2 Lobot 175 and happy disposition, seemed to ~ <data.fram~
+#> 3 Gregar Typho 185 existence; and had lived nearly t~ <data.fram~
+#> 4 Jocasta Nu 167 little to distress or vex her. <data.fram~
+#> 5 Poe Dameron NA "" <data.fram~
+#> 6 Luminara Unduli 170 She was the youngest of the two d~ <data.fram~
+#> 7 Owen Lars 178 indulgent father; and had, in con~ <data.fram~
+#> 8 Taun We 213 mistress of his house from a very~ <data.fram~
+#> 9 C-3PO 167 too long ago for her to have more~ <data.fram~
+#> 10 Biggs Darklighter 183 her caresses; and her place had b~ <data.fram~
+#> 11 Boba Fett 183 governess, who had fallen little ~ <data.fram~
Retaining the relationship between input and output doesn’t mean you’ll need to send requests one-by-one. Batch requests by setting the texts_per_req
value which governs the number of texts that are sent per request. Per the MonkeyLearn documentation, the maximum we recommend sending at once is 200 requests.
If texts_per_req
is NULL, the default, we try to optimize the response time from the API by setting texts_per_req
to 200 when your input has more than 200 texts or to the length of the input
if you’ve got fewer. You’ll see a significant speedup by batching your requests this way. However, batching doesn’t save you on queries; a batch of 150 texts still uses up 150 queries.
These functions also include some more verbose progress reporting, letting you know what batch you’re on out of the total, and which texts are set to be processed in that batch.
+one_by_one <- system.time(output <- monkey_classify(input, texts_per_req = 1))
+#> Using classifier ID cl_oFKL5wft; to find other classifiers, run monkeylearn_classifiers() or visit https://app.monkeylearn.com/main/explore/
+#> The following indices were empty strings and could not be sent to the API: 5. They will still be included in the output.
+#> Processing batch 1 of 10 batches: texts 1 to 1
+#> Processing batch 2 of 10 batches: texts 1 to 2
+#> Processing batch 3 of 10 batches: texts 2 to 3
+#> Processing batch 4 of 10 batches: texts 3 to 4
+#> Processing batch 5 of 10 batches: texts 4 to 5
+#> Processing batch 6 of 10 batches: texts 5 to 6
+#> Processing batch 7 of 10 batches: texts 6 to 7
+#> Processing batch 8 of 10 batches: texts 7 to 8
+#> Processing batch 9 of 10 batches: texts 8 to 9
+#> Processing batch 10 of 10 batches: texts 9 to 10
+#>
+#>
+#> -------------
+#> Still working!
+#> --------------
+#> \
+#> \
+#> \
+#>
+#> .="=.
+#> _/.-.-.\_ _
+#> ( ( o o ) ) ))
+#> |/ " \| //
+#> \'---'/ //
+#> jgs /`"""`\\ ((
+#> / /_,_\ \\ \\
+#> \_\_'__/ \ ))
+#> /` /`~\ |//
+#> / / \ /
+#> ,--`,--'\/\ /
+#> '-- "--' '--'
batch_of_five <- system.time(output <- monkey_classify(input, texts_per_req = 5))
+#> Using classifier ID cl_oFKL5wft; to find other classifiers, run monkeylearn_classifiers() or visit https://app.monkeylearn.com/main/explore/
+#> The following indices were empty strings and could not be sent to the API: 5. They will still be included in the output.
+#> Processing batch 1 of 2 batches: texts 1 to 5
+#> Processing batch 2 of 2 batches: texts 5 to 10
How much does sending 5 texts in a batch vs. 1 text improve our processing time?
+ +A 3-4x speedup isn’t so bad! Worth keeping in mind that if you need the blazing fast speeds you might consider upgrading to a higher MonkeyLearn price tier.
+monkeylearn
in R doing citation(package = 'monkeylearn')
Thanks to Julia Silge’s fantastic janeaustenr
package for this text!↩