diff --git a/DESCRIPTION b/DESCRIPTION index 26a0171..33b4080 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -16,6 +16,7 @@ Imports: readr (>= 2.1.4), rlang (>= 1.1.2), stringr (>= 1.5.1), + styler (>= 1.10.2), tools (>= 4.3.2), vctrs (>= 0.6.5) Config/testthat/edition: 3 diff --git a/NAMESPACE b/NAMESPACE index 1ee47d5..3ba4b26 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -2,6 +2,7 @@ export(add_chat_transcript) export(clean_transcript) +export(entity_extractor) export(extract_text_from_transcript) export(format_summary_tree) export(generate_recording_details) diff --git a/R/data_management.R b/R/data_management.R index 2e1fc24..af715db 100644 --- a/R/data_management.R +++ b/R/data_management.R @@ -623,7 +623,10 @@ import_transcript_from_file <- function( as.numeric() # Extract the text - text <- lines[.x + 1] + text <- lines[.x + 1] |> + # MS Teams vtt has an xml tag with speaker info + stringr::str_remove_all("</?v.*?>") |> + stringr::str_squish() # Return the data extract <- data.frame( @@ -633,12 +636,23 @@ import_transcript_from_file <- function( ) # If the transcript is diarized, extract the speaker - if (import_diarization && lines[.x - 1] != "") { + if (import_diarization) { # Extract the speaker - cur_speaker <- lines[.x - 1] |> + cur_speaker <- lines[.x - 1] + + # Normal VTT style + if (stringr::str_detect(cur_speaker, "^\\d+ ")) { # the name is between double quotes - stringr::str_extract_all('(?<=").*(?=")') |> + cur_speaker <- stringr::str_extract_all(cur_speaker, '(?<=").*(?=")') |> unlist() + } else if (stringr::str_detect(lines[.x + 1], "^<v ")) { + # MS Teams vtt style: the speaker name is inside the <v ...> tag + cur_speaker <- stringr::str_extract_all( + lines[.x + 1], '(?<=<v ).*?(?=>)') |> + unlist() + } else { + cur_speaker <- NA + } # In the unlikely case that there are multiple speakers, join them if (length(cur_speaker) > 1) { @@ -988,7 +1002,7 @@ add_chat_transcript <- function( #' an R file containing such a list. See `summarise_full_meeting` for more #' details. 
If NULL, the user will be asked if the system should try to #' generate the agenda automatically, using the `infer_agenda_from_transcript` -#' function. +#' function. If FALSE, the agenda will not be used. #' @param expected_agenda A character string. Only used if the `agenda` argument #' is `NULL` and the user requests the automatic agenda generation. this #' string will be used to drive the LLM while generating the agenda. See @@ -1007,7 +1021,7 @@ add_chat_transcript <- function( #' use. See `summarise_full_meeting` for more details. #' @param event_description A string containing a description of the meeting. #' See `summarise_transcript` for more details. -#' @param event_audience A string containing a description of the audience of +#' @param audience A string containing a description of the audience of #' the meeting and what to focus on in the summary. See `summarise_transcript` #' for more details. #' @param vocabulary A character vector of specific vocabulary words, names, @@ -1088,7 +1102,7 @@ speech_to_summary_workflow <- function( extra_agenda_generation_args = NULL, event_description = NULL, - event_audience = "An audience with understanding of the topic", + audience = "An audience with understanding of the topic", vocabulary = NULL, consider_diarization = TRUE, summary_structure = get_prompts("summary_structure"), @@ -1251,9 +1265,21 @@ speech_to_summary_workflow <- function( ## Perform summarization ## + if (length(agenda) > 1) { + stop("The agenda argument should be of length 1.") + } + + # If the agenda argument is a character and the file does not exist, stop the + # process + if (!isFALSE(agenda) && !purrr::is_empty(agenda) && + is.character(agenda) && !file.exists(agenda)) { + stop("The agenda file does not exist. 
Use agenda = FALSE to not use an + agenda in the creation of the summary, or agenda = NULL to generate + the agenda automatically (may take time).") + } + # Agenda is not provided, ask whether to generate a default agenda - if (purrr::is_empty(agenda) || - (is.character(agenda) && !file.exists(agenda))) { + if (!isFALSE(agenda) && purrr::is_empty(agenda)) { cat("No agenda was provided or found in the target directory.\n") @@ -1267,34 +1293,42 @@ speech_to_summary_workflow <- function( choice <- utils::menu( choices = c( "Generate the agenda automatically (You will need to review it before proceeding)", + "Do not generate an agenda and proceed with one overall summary.", "Exit (write your own agenda)" ), title = "How do you want to proceed?" ) } - if (choice != 1) { + if (choice == 2) { + message("Proceeding without an agenda. We suggest to use the rolling + window summarization method for recordings longer than 1 hour.") + agenda <- FALSE + } else if (choice != 1) { message("Aborted by user. Returning transcript data only (invisibly).") return(invisible(transcript_data)) - } + } else { - # Generate a default agenda with 1 talk/meeting if none is provided - agenda_infer_args <- c(list( - transcript = transcript_data, - event_description = event_description, - vocabulary = vocabulary, - diarization_instructions = extra_diarization_instructions, - start_time = event_start_time, - expected_agenda = expected_agenda, - window_size = agenda_generation_window_size, - output_file = file.path(target_dir, "agenda.R"), - provider = llm_provider - ), extra_agenda_generation_args) - - agenda <- do.call(infer_agenda_from_transcript, agenda_infer_args) - - message("Agenda generated. 
Please review it before proceeding.") - return(invisible(transcript_data)) + # Generate a default agenda with 1 talk/meeting if none is provided + agenda_infer_args <- c(list( + transcript = transcript_data, + event_description = event_description, + vocabulary = vocabulary, + diarization_instructions = extra_diarization_instructions, + start_time = event_start_time, + expected_agenda = expected_agenda, + window_size = agenda_generation_window_size, + output_file = if (!purrr::is_empty(agenda) && is.character(agenda)) { + file.path(target_dir, basename(agenda)) + } else file.path(target_dir, "agenda.R"), + provider = llm_provider + ), extra_agenda_generation_args) + + agenda <- do.call(infer_agenda_from_transcript, agenda_infer_args) + + message("Agenda generated. Please review it before proceeding.") + return(invisible(transcript_data)) + } } message("\n### Summarizing transcript...\n") @@ -1303,19 +1337,44 @@ speech_to_summary_workflow <- function( stop("No LLM provider defined.") } + # Manage situations where the formatted output file exists + if (!is.null(formatted_output_file) && + isFALSE(overwrite_formatted_output) && + file.exists(formatted_output_file)) { + + if (interactive()) { + choice <- utils::menu( + choices = c( + "Overwrite the existing formatted summary file", + "Abort the process" + ), + title = "The formatted summary output file already exists and overwrite is FALSE. What do you want to do?" 
+ ) + + if (choice == 2) { + message("Aborted by user.") + return(invisible(transcript_data)) + + } else { + message("Overwriting the existing formatted summary file.") + } + } else { + message("The formatted summary output file already exists and overwrite is FALSE.\nSet overwrite_formatted_output = TRUE to overwrite it or remove it.") + return(invisible(transcript_data)) + } + + } + + # Common summarization arguments summarization_args <- c(list( transcript_data = transcript_data, - agenda = agenda, method = summarization_method, window_size = summarization_window_size, output_length = summarization_output_length, - output_file = summarization_output_file, - - event_start_time = event_start_time, event_description = event_description, - event_audience = event_audience, + audience = audience, vocabulary = vocabulary, consider_diarization = consider_diarization, @@ -1323,29 +1382,43 @@ speech_to_summary_workflow <- function( extra_diarization_instructions = extra_diarization_instructions, extra_output_instructions = extra_output_instructions, - provider = llm_provider, - overwrite = overwrite_summary_tree + provider = llm_provider ), extra_summarise_args) + if (isFALSE(agenda)) { + # Summarize as single talk + + formatted_summary <- do.call(summarise_transcript, summarization_args) - summary_tree <- do.call(summarise_full_meeting, summarization_args) + return_vec <- c("transcript_data", "formatted_summary") + + } else { - ## Format summary tree ## + # Summarize as multiple talks - if (overwrite_formatted_output || !file.exists(formatted_output_file)) { + # Necessary extra arguments for the summarization of whole events + summarization_args$agenda <- agenda + summarization_args$overwrite <- overwrite_summary_tree + summarization_args$output_file <- summarization_output_file + summarization_args$event_start_time <- event_start_time + summary_tree <- do.call(summarise_full_meeting, summarization_args) + + ## Format summary tree ## message("\n### Formatting summary 
tree...\n") formatted_summary <- format_summary_tree( summary_tree = summary_tree, agenda = agenda, event_start_time = event_start_time, - output_file = formatted_output_file) + output_file = NULL) - } else { - message("\n### Loading existing formatted summary...\n") - formatted_summary <- readr::read_file(formatted_output_file) + return_vec <- c("transcript_data", "summary_tree", "formatted_summary") } - mget(c("transcript_data", "summary_tree", "formatted_summary")) + message("\n### Writing to file...\n") + + readr::write_lines(formatted_summary, formatted_output_file) + + return(mget(return_vec)) } diff --git a/R/prompts.R b/R/prompts.R index 3252e75..059c959 100644 --- a/R/prompts.R +++ b/R/prompts.R @@ -646,11 +646,19 @@ The transcript is formatted as a csv with the start and end time of each segment title = "The talk title", type = "A label to define the talk", description = "A description of this talk", - speakers = ["speaker 1", "speaker 2"], - moderators = ["moderator 1"] # If detectable, otherwise ignore this field + speakers = ["a list of speakers"], # If detectable, otherwise ignore this + moderators = ["a list of moderators"] # If detectable/appropriate, otherwise ignore this field } ###', + if (!is.null(args$expected_agenda_element)) { + paste("The event expected agenda is the following, so try to match the extracted talk to this structure. But feel free to describe a novel element if you cannot find a logical match, since there could have been unexpected changes in the agenda: ###\n", + args$expected_agenda_element, + "\n###") + }, + + "Provide your output.", + sep = "\n\n" ) } diff --git a/R/summarization.R b/R/summarization.R index 7d2f5dd..507dfb3 100644 --- a/R/summarization.R +++ b/R/summarization.R @@ -316,7 +316,7 @@ summarise_transcript <- function( #' time is not the start time of the event. #' @param event_description The description of the event See #' `summarise_transcript` for more details. 
-#' @param event_audience The audience of the event See `summarise_transcript` +#' @param audience The audience of the event See `summarise_transcript` #' for more details. #' @param vocabulary The vocabulary used in the meeting. See #' `summarise_transcript` for more details. @@ -346,7 +346,7 @@ summarise_full_meeting <- function( event_start_time = getOption("minutemaker_event_start_time"), event_description = NULL, - event_audience = "An audience with understanding of the topic", + audience = "An audience with understanding of the topic", vocabulary = NULL, consider_diarization = TRUE, @@ -437,7 +437,7 @@ summarise_full_meeting <- function( event_description = event_description, recording_details = recording_details, vocabulary = vocabulary, - audience = event_audience, + audience = audience, consider_diarization = consider_diarization, summary_structure = summary_structure, @@ -458,6 +458,7 @@ summarise_full_meeting <- function( # Update the results file dput(result_tree, file = output_file) + styler::style_file(output_file) } if (length(result_tree) == 0) { @@ -480,6 +481,9 @@ summarise_full_meeting <- function( #' the event. #' @param vocabulary A character vector of specific vocabulary words, names, #' definitions, to help the LLM recognise misspellings and abbreviations. +#' @param diarization_instructions Instructions for the diarization of the +#' transcript. Default is NULL. If provided, it will help the LLM in +#' recognizing the speakers in the transcript. #' @param start_time The start time of the event in the HH:MM(:SS)( AM/PM) #' format. Necessary to convert the agenda times from seconds to an easier to #' read format. 
@@ -538,11 +542,11 @@ infer_agenda_from_transcript <- function( } transcript_data <- transcript_data |> - select(start, end, text, any_of("speaker")) |> + select("start", "end", "text", any_of("speaker")) |> mutate( - across(c(start, end), ~ round(.x)), + across(all_of(c("start", "end")), ~ round(.x)), ) |> - filter(!is_silent(text)) + filter(!is_silent(.data$text)) breakpoints <- seq( transcript_data$start[1], max(transcript_data$start), by = window_size) @@ -551,8 +555,8 @@ infer_agenda_from_transcript <- function( pauses <- transcript_data |> filter( - start - lag(end, default = 0) > pause_duration - ) |> pull(start) + .data$start - lag(.data$end, default = 0) > pause_duration + ) |> pull("start") breakpoints <- c(breakpoints, pauses) |> sort() @@ -844,6 +848,7 @@ infer_agenda_from_transcript <- function( if (!is.null(output_file)) { dput(agenda, file = output_file) + styler::style_file(output_file) } options( @@ -854,3 +859,79 @@ infer_agenda_from_transcript <- function( agenda } + +#' Extract entities from a text +#' +#' This function takes a text and extracts entities from it. The entities can be +#' people, acronyms, organizations, and concepts. The function returns a vector +#' with the entities found in the text. Can be useful to build vocabularies for +#' the LLMs starting from an event description or a transcript. +#' +#' @param text The text from which to extract the entities. +#' @param entities A character vector with the entities to extract. Can be +#' "people", "acronyms", "organizations", and "concepts". Default is all of +#' them. +#' @param prompt_only If TRUE, only the prompt is returned, the LLM is not +#' interrogated. Default is FALSE. +#' @param ... Additional arguments passed to the `interrogate_llm` function. +#' +#' @return A vector with the entities found in the text. +#' +#' @export +#' +entity_extractor <- function( + text, + entities = c("people", "acronyms", "organizations", "concepts"), + prompt_only = FALSE, + ... 
+ ) { + + text <- paste(text, collapse = "--------\n\n\n") + + acro_or_concepts <- entities[entities %in% c("acronyms", "concepts")] + + task <- paste0( + "You will be passed one or more text documents. For each document, you ", + "should extract the following entities from the text:\n\n", + sprintf("-`%s`;", entities) |> paste(collapse = "\n"), + "\n\nYou should return a JSON object of the entities found in the text, with each ", + "entity type as a key and a list of the entities of that type as the ", + "value. For example, if you find two people and one organization in the ", + "text, you should return a list with two keys, 'people' and 'organizations', ", + "and the corresponding lists of entities as values.\n\n", + if (length(acro_or_concepts) > 0) { + paste0("If you find ", paste(acro_or_concepts, collapse = " or "), + ", they should be returned as a list of strings, with each element ", + "formatted as 'entity: definition', ", + "trying to infer the definition from the context. ", + "If you are not 100% sure, or it's self explanatory, just list the concepts ", + "as strings.\n\n") + }, + "Here is an example of the expected output:\n\n", + '```json + { + "people": ["John Doe", "Jane Smith"], + "organizations": ["Acme Corp"], + "acronyms": [ + "LLM: Large Language Model", + "NLP: Natural Language Processing" + ], + "concepts": [ + "Arxiv: Open access repository of scientific articles", + "Escherichia coli" + ] + } + ```\n\n', + "Here is the text from which you should extract the entities:\n\n####\n\n", + text, "\n\n####\n\nProvide your JSON output below.") + + if (prompt_only) { + return(task) + } + + interrogate_llm( + c("system" = get_prompts("persona"), "user" = task), + force_json = TRUE, ...) 
|> + jsonlite::fromJSON() |> + unlist() |> unname() +} diff --git a/R/utils.R b/R/utils.R index 0a80f75..c054efb 100644 --- a/R/utils.R +++ b/R/utils.R @@ -23,7 +23,7 @@ check_and_install_dependencies <- function(deps) { if(do_install) { try({ - install.packages(dep) + utils::install.packages(dep) # After successful installation, recheck if the package is now installed is_installed <- requireNamespace(dep, quietly = FALSE) }) diff --git a/man/entity_extractor.Rd b/man/entity_extractor.Rd new file mode 100644 index 0000000..bc7f51b --- /dev/null +++ b/man/entity_extractor.Rd @@ -0,0 +1,34 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/summarization.R +\name{entity_extractor} +\alias{entity_extractor} +\title{Extract entities from a text} +\usage{ +entity_extractor( + text, + entities = c("people", "acronyms", "organizations", "concepts"), + prompt_only = FALSE, + ... +) +} +\arguments{ +\item{text}{The text from which to extract the entities.} + +\item{entities}{A character vector with the entities to extract. Can be +"people", "acronyms", "organizations", and "concepts". Default is all of +them.} + +\item{prompt_only}{If TRUE, only the prompt is returned, the LLM is not +interrogated. Default is FALSE.} + +\item{...}{Additional arguments passed to the \code{interrogate_llm} function.} +} +\value{ +A vector with the entities found in the text. +} +\description{ +This function takes a text and extracts entities from it. The entities can be +people, acronyms, organizations, and concepts. The function returns a vector +with the entities found in the text. Can be useful to build vocabularies for +the LLMs starting from an event description or a transcript. 
+} diff --git a/man/infer_agenda_from_transcript.Rd b/man/infer_agenda_from_transcript.Rd index b0f51e3..9c54c77 100644 --- a/man/infer_agenda_from_transcript.Rd +++ b/man/infer_agenda_from_transcript.Rd @@ -26,6 +26,10 @@ the event.} \item{vocabulary}{A character vector of specific vocabulary words, names, definitions, to help the LLM recognise misspellings and abbreviations.} +\item{diarization_instructions}{Instructions for the diarization of the +transcript. Default is NULL. If provided, it will help the LLM in +recognizing the speakers in the transcript.} + \item{start_time}{The start time of the event in the HH:MM(:SS)( AM/PM) format. Necessary to convert the agenda times from seconds to an easier to read format.} diff --git a/man/speech_to_summary_workflow.Rd b/man/speech_to_summary_workflow.Rd index ba827bd..5449836 100644 --- a/man/speech_to_summary_workflow.Rd +++ b/man/speech_to_summary_workflow.Rd @@ -28,11 +28,11 @@ speech_to_summary_workflow( chat_format = "webex", agenda = file.path(target_dir, "agenda.R"), expected_agenda = NULL, - agenda_generation_window_size = 7200, + agenda_generation_window_size = 3600, agenda_generation_output_file = file.path(target_dir, "agenda.R"), extra_agenda_generation_args = NULL, event_description = NULL, - event_audience = "An audience with understanding of the topic", + audience = "An audience with understanding of the topic", vocabulary = NULL, consider_diarization = TRUE, summary_structure = get_prompts("summary_structure"), @@ -133,7 +133,7 @@ talk, talk description and start and end times. Alternatively, the path to an R file containing such a list. See \code{summarise_full_meeting} for more details. If NULL, the user will be asked if the system should try to generate the agenda automatically, using the \code{infer_agenda_from_transcript} -function.} +function. If FALSE, the agenda will not be used.} \item{expected_agenda}{A character string. 
Only used if the \code{agenda} argument is \code{NULL} and the user requests the automatic agenda generation. this @@ -156,7 +156,7 @@ function will be taken from the \code{extra_agenda_generation_args} argument.} \item{event_description}{A string containing a description of the meeting. See \code{summarise_transcript} for more details.} -\item{event_audience}{A string containing a description of the audience of +\item{audience}{A string containing a description of the audience of the meeting and what to focus on in the summary. See \code{summarise_transcript} for more details.} diff --git a/man/summarise_full_meeting.Rd b/man/summarise_full_meeting.Rd index 9028af7..fd89f6b 100644 --- a/man/summarise_full_meeting.Rd +++ b/man/summarise_full_meeting.Rd @@ -13,7 +13,7 @@ summarise_full_meeting( output_length = 3, event_start_time = getOption("minutemaker_event_start_time"), event_description = NULL, - event_audience = "An audience with understanding of the topic", + audience = "An audience with understanding of the topic", vocabulary = NULL, consider_diarization = TRUE, summary_structure = get_prompts("summary_structure"), @@ -52,7 +52,7 @@ time is not the start time of the event.} \item{event_description}{The description of the event See \code{summarise_transcript} for more details.} -\item{event_audience}{The audience of the event See \code{summarise_transcript} +\item{audience}{The audience of the event See \code{summarise_transcript} for more details.} \item{vocabulary}{The vocabulary used in the meeting. 
See diff --git a/renv.lock b/renv.lock index 1bd4e45..65c869a 100644 --- a/renv.lock +++ b/renv.lock @@ -9,6 +9,60 @@ ] }, "Packages": { + "R.cache": { + "Package": "R.cache", + "Version": "0.16.0", + "Source": "Repository", + "Repository": "RSPM", + "Requirements": [ + "R", + "R.methodsS3", + "R.oo", + "R.utils", + "digest", + "utils" + ], + "Hash": "fe539ca3f8efb7410c3ae2cf5fe6c0f8" + }, + "R.methodsS3": { + "Package": "R.methodsS3", + "Version": "1.8.2", + "Source": "Repository", + "Repository": "RSPM", + "Requirements": [ + "R", + "utils" + ], + "Hash": "278c286fd6e9e75d0c2e8f731ea445c8" + }, + "R.oo": { + "Package": "R.oo", + "Version": "1.26.0", + "Source": "Repository", + "Repository": "RSPM", + "Requirements": [ + "R", + "R.methodsS3", + "methods", + "utils" + ], + "Hash": "4fed809e53ddb5407b3da3d0f572e591" + }, + "R.utils": { + "Package": "R.utils", + "Version": "2.12.3", + "Source": "Repository", + "Repository": "RSPM", + "Requirements": [ + "R", + "R.methodsS3", + "R.oo", + "methods", + "tools", + "utils" + ], + "Hash": "3dc2829b790254bfba21e60965787651" + }, "R6": { "Package": "R6", "Version": "2.5.1", @@ -106,6 +160,17 @@ ], "Hash": "411ca2c03b1ce5f548345d2fc2685f7a" }, + "digest": { + "Package": "digest", + "Version": "0.6.35", + "Source": "Repository", + "Repository": "RSPM", + "Requirements": [ + "R", + "utils" + ], + "Hash": "698ece7ba5a4fa4559e3d537e7ec3d31" + }, "dplyr": { "Package": "dplyr", "Version": "1.1.4", @@ -368,6 +433,16 @@ ], "Hash": "42548638fae05fd9a9b5f3f437fbbbe2" }, + "rprojroot": { + "Package": "rprojroot", + "Version": "2.0.4", + "Source": "Repository", + "Repository": "RSPM", + "Requirements": [ + "R" + ], + "Hash": "4c8415e0ec1e29f3f4f6fc108bef0144" + }, "stringi": { "Package": "stringi", "Version": "1.8.3", @@ -398,6 +473,25 @@ ], "Hash": "960e2ae9e09656611e0b8214ad543207" }, + "styler": { + "Package": "styler", + "Version": "1.10.3", + "Source": "Repository", + "Repository": "RSPM", + "Requirements": [ + "R", + "R.cache", + 
"cli", + "magrittr", + "purrr", + "rlang", + "rprojroot", + "tools", + "vctrs", + "withr" + ], + "Hash": "93a2b1beac2437bdcc4724f8bf867e2c" + }, "sys": { "Package": "sys", "Version": "3.4.2",