Skip to content

Commit ae4fc3c

Browse files
committed Apr 11, 2024
feat: add entity extractor function
Useful to extract terms from an event description or transcript
1 parent 40f7620 commit ae4fc3c

File tree

1 file changed

+76
-0
lines changed

1 file changed

+76
-0
lines changed
 

‎R/summarization.R

Lines changed: 76 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -854,3 +854,79 @@ infer_agenda_from_transcript <- function(
854854

855855
agenda
856856
}
857+
858+
#' Extract entities from a text
#'
#' This function takes a text and extracts entities from it. The entities can be
#' people, acronyms, organizations, and concepts. The function returns a vector
#' with the entities found in the text. Can be useful to build vocabularies for
#' the LLMs starting from an event description or a transcript.
#'
#' @param text The text from which to extract the entities. If a vector with
#'   more than one element is passed, the elements are joined into a single
#'   prompt, separated by a dashed line.
#' @param entities A character vector with the entities to extract. Can be
#'   "people", "acronyms", "organizations", and "concepts". Default is all of
#'   them.
#' @param prompt_only If TRUE, only the prompt is returned, the LLM is not
#'   interrogated. Default is FALSE.
#' @param ... Additional arguments passed to the `interrogate_llm` function.
#'
#' @return If `prompt_only` is TRUE, a single string with the prompt.
#'   Otherwise, an unnamed vector with the entities found in the text, obtained
#'   by flattening the JSON object returned by the LLM.
#'
#' @export
#'
entity_extractor <- function(
    text,
    entities = c("people", "acronyms", "organizations", "concepts"),
    prompt_only = FALSE,
    ...
) {

  # Merge multiple documents into one string; the separator sits on its own
  # line so it is not glued to the end of the preceding document.
  text <- paste(text, collapse = "\n\n--------\n\n")

  # Acronyms and concepts get an extra instruction asking for definitions.
  acro_or_concepts <- entities[entities %in% c("acronyms", "concepts")]

  definition_instructions <- if (length(acro_or_concepts) > 0) {
    paste0(
      "If you find ", paste(acro_or_concepts, collapse = " or "),
      ", they should be returned as a list of strings, with each element ",
      "formatted as 'entity: definition', ",
      "trying to infer the definition from the context. ",
      "If you are not 100% sure, or it's self explanatory, just list the ",
      "concepts as strings.\n\n"
    )
  }
  # When no acronyms/concepts are requested `definition_instructions` is NULL,
  # which `paste0()` below treats as an empty string.

  # Built from explicit pieces so the code's indentation never leaks into the
  # prompt.
  example_output <- paste0(
    "```json\n",
    "{\n",
    "  \"people\": [\"John Doe\", \"Jane Smith\"],\n",
    "  \"organizations\": [\"Acme Corp\"],\n",
    "  \"acronyms\": [\n",
    "    \"LLM: Large Language Model\",\n",
    "    \"NLP: Natural Language Processing\"\n",
    "  ],\n",
    "  \"concepts\": [\n",
    "    \"Arxiv: Open access repository of scientific articles\",\n",
    "    \"Escherichia coli\"\n",
    "  ]\n",
    "}\n",
    "```\n\n"
  )

  task <- paste0(
    "You will be passed one or more text documents. For each document, you ",
    "should extract the following entities from the text:\n\n",
    sprintf("-`%s`;", entities) |> paste(collapse = "\n"),
    "\n\nYou should return a JSON object of the entities found in the text, ",
    "with each ",
    "entity type as a key and a list of the entities of that type as the ",
    "value. For example, if you find two people and one organization in the ",
    "text, you should return a list with two keys, 'people' and ",
    "'organizations', ",
    "and the corresponding lists of entities as values.\n\n",
    definition_instructions,
    "Here is an example of the expected output:\n\n",
    example_output,
    "Here is the text from which you should extract the entities:\n\n####\n\n",
    text, "\n\n####\n\nProvide your JSON output below."
  )

  if (prompt_only) {
    return(task)
  }

  # Query the LLM asking for JSON output, then flatten the parsed object into
  # a plain unnamed vector of entities.
  interrogate_llm(
    c("system" = get_prompts("persona"), "user" = task),
    force_json = TRUE, ...
  ) |>
    jsonlite::fromJSON() |>
    unlist() |>
    unname()
}

0 commit comments

Comments
 (0)