diff --git a/R/data_management.R b/R/data_management.R index 7336d28..cfc22d4 100644 --- a/R/data_management.R +++ b/R/data_management.R @@ -984,8 +984,25 @@ add_chat_transcript <- function( #' generate the chat file. See `add_chat_transcript` for more details. #' @param agenda The agenda of the meeting, that is, a list of agenda elements #' each with a session name, a title, speaker and moderator lists, type of -#' talk and start and end times. Alternatively, the path to an R file -#' containing such a list. See `summarise_full_meeting` for more details. +#' talk, talk description and start and end times. Alternatively, the path to +#' an R file containing such a list. See `summarise_full_meeting` for more +#' details. If NULL, the user will be asked if the system should try to +#' generate the agenda automatically, using the `infer_agenda_from_transcript` +#' function. +#' @param expected_agenda A character string. Only used if the `agenda` argument +#' is `NULL` and the user requests the automatic agenda generation. this +#' string will be used to drive the LLM while generating the agenda. See +#' `infer_agenda_from_transcript` for more details. +#' @param agenda_generation_window_size The size of the window in seconds to +#' analyze at once when generating the agenda. See +#' `infer_agenda_from_transcript` for more details. +#' @param agenda_generation_output_file A string with the path to the output +#' file where the automatically generated agenda will be written. Should be a +#' .R file. See `infer_agenda_from_transcript` for more details. +#' @param extra_agenda_generation_args Additional arguments passed to the +#' `infer_agenda_from_transcript` function. See `infer_agenda_from_transcript` +#' for more details. Note that the `diarization_instructions` argument for this +#' function will be taken from the `extra_agenda_generation_args` argument. #' @param summarization_method A string indicating the summarization method to #' use. See `summarise_full_meeting` for more details. #' @param event_description A string containing a description of the meeting. @@ -1000,9 +1017,9 @@ add_chat_transcript <- function( #' should take into account the diarization of the transcript. See #' `summarise_transcript` for more details. #' @param summary_structure,extra_diarization_instructions,extra_output_instructions -#' Specific instructions necessary to build the summarisation prompt. See -#' `summarise_transcript` for more details and run `get_prompts()` to see the -#' defaults. See `summarise_transcript` for more details. +#' Specific instructions necessary to build the summarisation prompt. See +#' `summarise_transcript` for more details and run `get_prompts()` to see the +#' defaults. See `summarise_transcript` for more details. #' @param llm_provider A string indicating the LLM provider to use for the #' summarization. See `summarise_transcript` for more details. #' @param extra_summarise_args Additional arguments passed to the @@ -1063,8 +1080,12 @@ speech_to_summary_workflow <- function( , full.names = T)[1], chat_format = "webex", - # Arguments for `summarise_full_meeting` + # Arguments for `summarise_full_meeting` and `infer_agenda_from_transcript` agenda = file.path(target_dir, "agenda.R"), + expected_agenda = NULL, + agenda_generation_window_size = 7200, + agenda_generation_output_file = file.path(target_dir, "agenda.R"), + extra_agenda_generation_args = NULL, event_description = NULL, event_audience = "An audience with understanding of the topic", @@ -1245,7 +1266,7 @@ speech_to_summary_workflow <- function( } else { choice <- utils::menu( choices = c( - "Generate a default agenda (i.e., process the transcript as one talk)", + "Generate the agenda automatically (You will need to review it before proceeding)", "Exit (write your own agenda)" ), title = "How do you want to proceed?" @@ -1258,12 +1279,22 @@ speech_to_summary_workflow <- function( } # Generate a default agenda with 1 talk/meeting if none is provided - agenda <- list( - list( - from = min(transcript_data$start), - to = max(transcript_data$end) - ) - ) + agenda_infer_args <- c(list( + transcript = transcript_data, + event_description = event_description, + vocabulary = vocabulary, + diarization_instructions = extra_diarization_instructions, + start_time = event_start_time, + expected_agenda = expected_agenda, + window_size = agenda_generation_window_size, + output_file = file.path(target_dir, "agenda.R"), + provider = llm_provider + ), extra_agenda_generation_args) + + agenda <- do.call(infer_agenda_from_transcript, agenda_infer_args) + + message("Agenda generated. Please review it before proceeding.") + return(invisible(transcript_data)) } message("\n### Summarizing transcript...\n") diff --git a/R/prompts.R b/R/prompts.R index 652b648..f17ead3 100644 --- a/R/prompts.R +++ b/R/prompts.R @@ -74,6 +74,29 @@ set_prompts <- function( base_task = "Your task is to provide a summary of the transcript segments that will be given to you.", aggregate_task_rolling = "Your task is to aggregate the summaries generated from the segments of a meeting/talk into a single, comprehensive text, without loss of information.", + agenda_inference_task = collapse( + "Your task is to extract individual talks from this transcript, creating an agenda. You can identify them from a change of speaker, and or, a change of topic. Try to detect broad changes of topics so to avoid splitting the transcript into an excessively large number of small talks; a talk usually last at least 10-15 minutes to one hour, so join into one talk very short change of topics, even if the speaker change. Aggregate both the talk itself and possible Q&A sessions in the same talk.", + "You wil be FIRST producing a step by step reasoning of what could be a good subdivision of the transcript into different talks, considering different competing subdivisions, and THEN suggest the agenda. Take speakers, topics, and timings into consideration in your reasoning.", + "Your output will be a JSON object with two components: your reasoning and the actual agenda. The agenda must be an array of \"talk\" objects, each with the talk title, a short description (1 sentence), a label with the type of talk (e.g. welcome talk, conference outline, conference talk, meeting discussion, Q&A session, etc...), an array with one of more speakers, another array with moderators (if detectable) and the starting and end time in seconds. Add also the \"session\" object if it make sense as grouping.", + "Here's an example of the output structure:", + "### + { + reasoning = \"Your reasoning goes here\", + agenda = [ + { + title = \"The talk title\", + type = \"Conference talk\", + description = \"A description of this talk\", + speakers = [\"speaker 1\", \"speaker 2\"], + moderators = [\"moderator 1\"] # If detectable, otherwise ignore this field + from = 1231, to = 2023 + }, + {...}, /* another talk element */ + ... + ] + } + ###", + "Important: process the whole transcript, do not be lazy: your agenda should cover the entirety of the transcript."), event_description_template = collapse( "The following is a description of the event in which the talk/meeting took place, which will provide you with context.", @@ -93,18 +116,15 @@ set_prompts <- function( "", "{transcript}", "" ), - # transcript_template_one_shot = collapse( - # "Here is the transcript segment you need to summarise:", - # "", "{transcript}", "" - # ), - # - # transcript_template_rolling = collapse( - # "Here is the transcript of the segment you need to summarise:", - # "", "[...]\n{transcript}\n[...]", "" - # ), - aggregate_template_rolling = "Here are the segment summaries to aggregate:", + agenda_inference_template = collapse( + "This is the transcript of an event/meeting:\n", + "{transcript}", + "\n", + "The transcript is formatted as a csv with the start and end time of each segment and the segment text." + ), + vocabulary_template = "Mind that the transcript is not perfect and the following and other terms and names may have been wrongly transcribed. Here's a list of technical terms, acronyms and names you may find in the trascript and that may have been wrongly transcribed:\n{vocabulary}.\nRecognize and correct misspelled versions of these and other related terms and names.", diarization_template = collapse( @@ -447,3 +467,190 @@ generate_rolling_aggregation_prompt <- function( stringr::str_glue_data(prompt, .x = args, .null = NULL) } + + +#' Generate the agenda inference prompt +#' +#' This function is used by `infer_agenda_from_transcript()` to generate a +#' prompt for inferring the agenda from a transcript. +#' +#' @param transcript_segment A segment of the transcript to be used for +#' inferring the agenda. Can be a character vector representing the data in CSV +#' format or a data frame. +#' @param args A list of arguments to be passed to the prompt template. They can +#' include: event_description, vocabulary and expected_agenda. +#' +#' @return A prompt used by `infer_agenda_from_transcript()`. +#' +generate_agenda_inference_prompt <- function( + transcript_segment, + args +) { + + if (is.data.frame(transcript_segment)) { + transcript_segment <- readr::format_csv(transcript_segment) + } + + if (!is.null(args$vocabulary)) { + # Format the vocabulary argument if a vector is provided + args$vocabulary <- paste0( + "- ", + args$vocabulary, + collapse = "\n" + ) + } + + # Aggregate instructions if length > 1 vectors and convert into the + # extra_diarization_instructions argument + if (length(args$diarization_instructions) > 0) { + args$extra_diarization_instructions <- paste( + args$diarization_instructions, collapse = "\n" + ) + } + + long_arguments <- purrr::map_lgl(args, ~ length(.x) > 1) + + if (any(long_arguments)) { + stop("All arguments in args should have length 1:\n", + stringr::str_flatten_comma(names(args)[long_arguments])) + } + + prompt <- paste( + "Your task is to extract individual talks from a transcript, creating an agenda.", + + if (!is.null(args$event_description)) { + # Uses the {event_description} argument + get_prompts("event_description_template") + }, + + if (!is.null(args$vocabulary)) { + # Uses the {vocabulary} argument + get_prompts("vocabulary_template") + }, + + # Uses the {extra_diarization_instructions} argument + if (!is.null(args$diarization_instructions)) { + get_prompts("diarization_template") + }, + + "This is the transcript of the event/meeting from which you need to infer the agenda items:\n\n{transcript_segment}\n\n\nThe transcript is formatted as a csv with the start and end time of each segment, the segment text and possibly, the speakers.", + + sep = "\n\n" + ) |> + stringr::str_glue_data(.x = args, .null = NULL) |> + paste( + 'You can identify the talks from a change of speakers, and or, a change of topic. Try to detect broad changes of topics so to avoid splitting the transcript into an excessively large number of small talks; a talk usually last at least 10-15 minutes to one hour, so join into one talk very short change of topics, even if the speaker change. Aggregate talks and the related Q&A sessions in the same talk. + +You wil be FIRST producing an INFORMATION DENSE, step by step reasoning of what could be a good subdivision of the transcript into different talks, considering different competing subdivisions, listing each identified talk start time and topics. THEN you will extract the starting times of each talk. + +Take speakers, topics, and timings into consideration in your reasoning. The reasoning doesn\'t have to be human readable. Favor a high information over length ratio.', + + if (!is.null(args$expected_agenda)) { + stringr::str_glue_data( + .x = args, + .null = NULL, + "The agenda is expected to have the following talks: ### +{expected_agenda} +### +Try to match the agenda you generated to this structure.") + }, + + 'Your output will be a JSON object with two components: your reasoning and the start times of each identified talks. Here\'s an example of the output structure: +### + { + reasoning = "Your reasoning goes here", + start_times = [1, 232, 1242, 2343, 5534, 7023, ...] + } + ### + +Important: process the whole transcript, do not be lazy: your agenda WILL cover the entirety of the transcript, FROM START TO END WITHOUT TIME HOLES.', + + sep ="\n" + ) +} + +#' Generate the prompt to extract an agenda element details from a transcript +#' +#' This function is used by `infer_agenda_from_transcript()` to generate a +#' prompt for extracting the details of an agenda element from a transcript. +#' +#' @param transcript_segment A segment of the transcript to be used for +#' extracting the details of an agenda element. Can be a character vector +#' representing the data in CSV format or a data frame. +#' @param args A list of arguments to be passed to the prompt template. They can +#' include: event_description and vocabulary. +#' +#' @return A prompt used by `infer_agenda_from_transcript()`. +#' +generate_agenda_element_prompt <- function( + transcript_segment, + args +) { + + if (is.data.frame(transcript_segment)) { + transcript_segment <- readr::format_csv(transcript_segment) + } + + if (!is.null(args$vocabulary)) { + # Format the vocabulary argument if a vector is provided + args$vocabulary <- paste0( + "- ", + args$vocabulary, + collapse = "\n" + ) + } + + # Aggregate instructions if length > 1 vectors and convert into the + # extra_diarization_instructions argument + if (length(args$diarization_instructions) > 0) { + args$extra_diarization_instructions <- paste( + args$diarization_instructions, collapse = "\n" + ) + } + + long_arguments <- purrr::map_lgl(args, ~ length(.x) > 1) + + if (any(long_arguments)) { + stop("All arguments in args should have length 1:\n", + stringr::str_flatten_comma(names(args)[long_arguments])) + } + + prompt <- paste( + "This is a segment of the transcript of an event/meeting: + +\n{transcript_segment}\n + +The transcript is formatted as a csv with the start and end time of each segment, the segment text and possibly, the speakers.", + + if (!is.null(args$event_description)) { + # Uses the {event_description} argument + get_prompts("event_description_template") + }, + + if (!is.null(args$vocabulary)) { + # Uses the {vocabulary} argument + get_prompts("vocabulary_template") + }, + + # Uses the {extra_diarization_instructions} argument + if (!is.null(args$diarization_instructions)) { + get_prompts("diarization_template") + }, + + sep = "\n\n" + ) |> + stringr::str_glue_data(.x = args, .null = NULL) |> + paste( + 'Your task is to extract a title and a short description (1-2 sentences max) for this talk, considering that it\'s part of a larger event. Assign also a label, e.g., welcome talk, conference outline, conference talk, meeting discussion, Q&A session, etc... (the start/end times can be helpful for this). Extract also the speakers and the moderators (if any). Format your output as a JSON object with the following structure: ### + { + title = "The talk title", + type = "A label to define the talk", + description = "A description of this talk", + speakers = ["speaker 1", "speaker 2"], + moderators = ["moderator 1"] # If detectable, otherwise ignore this field + } + ###', + + sep = "\n\n" + ) +} diff --git a/R/summarization.R b/R/summarization.R index 17103f0..7d2f5dd 100644 --- a/R/summarization.R +++ b/R/summarization.R @@ -467,3 +467,390 @@ summarise_full_meeting <- function( # Return the result tree invisibly invisible(result_tree) } + + +#' Infer the agenda from a transcript +#' +#' This function takes a transcript and various optional parameters, and uses an +#' LLM to generate an agenda. +#' +#' @param transcript The transcript to be summarised. Can be a file path or a +#' data frame. +#' @param event_description A description of the event. Provide context about +#' the event. +#' @param vocabulary A character vector of specific vocabulary words, names, +#' definitions, to help the LLM recognise misspellings and abbreviations. +#' @param start_time The start time of the event in the HH:MM(:SS)( AM/PM) +#' format. Necessary to convert the agenda times from seconds to an easier to +#' read format. +#' @param expected_agenda The expected agenda of the event. A text description +#' of the expected agenda. If provided, the LLM will be asked to generate an +#' agenda that matches this description. +#' @param window_size The time window that will be taken into account when +#' inferring the agenda. Default is 2 hours. A larger window will increase the +#' accuracy of the agenda since it will provide context and will prevent to +#' have talks crossing the window boundaries; also decrease the chance of +#' having the LLM being over sensitive to small changes in topics, generating +#' too many small talks. However, a larger window will also require a larger +#' LLM context. +#' @param output_file An optional file to save the results to. Default is NULL, +#' i.e., the results are not saved to a file. +#' @param ... Additional arguments passed to the `interrogate_llm` function. +#' Keep in consideration that this function needs LLMs that manages long +#' context and that produce valid JSON outputs. The `force_json` argument is +#' used with OpenAI based LLM but it's not accepted by other LLMs; therefore +#' the user may need to edit the system prompts to ensure that the output is a +#' valid JSON. +#' +#' @return An agenda in the usual list format. +#' +#' @export +#' +infer_agenda_from_transcript <- function( + transcript, + event_description = NULL, + vocabulary = NULL, + diarization_instructions = NULL, + start_time = NULL, + expected_agenda = NULL, + window_size = 3600, + output_file = NULL, + ... +) { + + # Set the default prompts if not already set + set_prompts() + + # import the transcript if it's a file path + if (is.character(transcript)) { + # Is the transcript a CSV file? + if (stringr::str_detect(transcript, "\\.csv$")) { + transcript_data <- readr::read_csv(transcript, show_col_types = FALSE) + } + # Is the transcript a subtitle file? + else { + transcript_data <- import_transcript_from_file(transcript) + } + } else if (is.data.frame(transcript)) { + transcript_data <- transcript + } else { + stop("The transcript must be a file path or a data frame.") + } + + transcript_data <- transcript_data |> + select(start, end, text, any_of("speaker")) |> + mutate( + across(c(start, end), ~ round(.x)), + ) |> + filter(!is_silent(text)) + + breakpoints <- seq( + transcript_data$start[1], max(transcript_data$start), by = window_size) + + pause_duration <- 1200 + + pauses <- transcript_data |> + filter( + start - lag(end, default = 0) > pause_duration + ) |> pull(start) + + breakpoints <- c(breakpoints, pauses) |> sort() + + for (i in which(breakpoints %in% pauses)) { + if (breakpoints[i] - breakpoints[i - 1] < pause_duration) { + breakpoints <- breakpoints[-(i - 1)] + } + + if (breakpoints[i + 1] - breakpoints[i] < pause_duration) { + breakpoints <- breakpoints[-(i + 1)] + } + } + + last_segment <- max(transcript_data$start) - tail(breakpoints, n=1) + + # Adjust if the last segment is less than window_size / 2 seconds + if (last_segment < (window_size / 2)) { + breakpoints <- utils::head(breakpoints, -1) + } + + stop <- FALSE + cur_bp <- 1 + json_error <- FALSE + + # Check if there was an already started session that got interrupted + arg_hash <- rlang::hash( + list( + transcript_data = transcript_data, + event_description = event_description, + vocabulary = vocabulary, + diarization_instructions = diarization_instructions, + start_time = start_time, + expected_agenda = expected_agenda, + window_size = window_size) + ) + + # Reset the temporary agenda if the arguments have changed + if (is.null(getOption("minutemaker_temp_agenda"))) { + options( + "minutemaker_temp_agenda" = list(), + "minutemaker_temp_agenda_last_bp" = NULL + ) + } else if (getOption("minutemaker_temp_agenda_hash", "") != arg_hash) { + options( + "minutemaker_temp_agenda" = list(), + "minutemaker_temp_agenda_last_bp" = NULL + ) + } else { + message("A temporary agenda was found. Resuming the inference.") + } + + options("minutemaker_temp_agenda_hash" = arg_hash) + + update_agenda <- function(agenda_elements) { + cur_agenda <- c( + getOption("minutemaker_temp_agenda", list()), + agenda_elements |> sort() + ) + + options("minutemaker_temp_agenda" = cur_agenda) + } + + message("- Inferring the agenda from the transcript") + + while (isFALSE(stop)) { + + bp_left <- breakpoints[cur_bp] + bp_right <- breakpoints[cur_bp + 1] + + # Stop if reached the end + if (is.na(bp_right)) { + bp_right <- max(transcript_data$start) + 1 + } + + # Check if the current segment was already processed + if (cur_bp <= getOption("minutemaker_temp_agenda_last_bp", 0)) { + if (cur_bp == length(breakpoints)) stop <- TRUE + + cur_bp <- cur_bp + 1 + + next + } + + transcript_segment <- transcript_data |> + dplyr::filter( + .data$start >= bp_left, + .data$start < bp_right + ) + + # Skip empty segments + if (nrow(transcript_segment) == 0) { + if (cur_bp == length(breakpoints)) stop <- TRUE + + cur_bp <- cur_bp + 1 + + next + } + + transcript_segment <- transcript_segment |> readr::format_csv() + + prompt <- generate_agenda_inference_prompt( + transcript_segment, + args = mget( + c("event_description", "vocabulary", + "diarization_instructions", "expected_agenda"), + ifnotfound = list(NULL)) + ) + + # Build the prompt set + prompt_set <- c( + system = get_prompts("persona"), + user = prompt + ) + + # If this is a retry for failed json parsing, add the previous result to the + # prompt set and add instructions to fix the output + if (json_error) { + prompt_set <- c( + prompt_set, + assistant = result_json, + user = "Your output was not a valid JSON. + Please correct it to provide a valid output.") + } + + # Attempt to interrogate the LLM + result_json <- try(interrogate_llm( + prompt_set, + ..., + force_json = TRUE + ), silent = TRUE) + + # If the interrogation fails due to too long output, retry with a smaller + # window + if (inherits(result_json, "try-error") && + grepl("Answer exhausted the context window", result_json)) { + + warning( + "Answer exhausted the context window. retrying...", + immediate. = T, call. = F) + + # Add a new breakpoint in the middle of the current segment + new_bp <- (bp_left + bp_right) / 2 + breakpoints <- sort(c(breakpoints, new_bp)) + + # Prevent stopping, in case the error happened on the last segment + stop <- FALSE + + next + } else if (inherits(result_json, "try-error")) { + + stop(result_json) + + } + + cat(result_json) + + # Attempt to parse the result json + parsed_result <- try( + jsonlite::fromJSON(result_json, simplifyDataFrame = F)$start_times, + silent = TRUE) + + # If the parsing fails... + if (inherits(parsed_result, "try-error")) { + + # If this is the first parsing error, retry with instructions to fix the + # output + if (!json_error) { + warning( + "Output not a valid JSON. retrying...", + immediate. = T, call. = F) + + json_error <- TRUE + } + # If this is the second parsing error, shorten the window + else { + + warning( + "Output not a valid JSON. Shortening the window...", + immediate. = T, call. = F) + + json_error <- FALSE + + # Add a new breakpoint in the middle of the current segment + new_bp <- (bp_left + bp_right) / 2 + breakpoints <- sort(c(breakpoints, new_bp)) + + } + + # Prevent stopping, in case the error happened on the last segment + stop <- FALSE + + next + } + + # If the parsing is successful, update the agenda + update_agenda(parsed_result) + + json_error <- FALSE + + options("minutemaker_temp_agenda_last_bp" = cur_bp) + + if (cur_bp == length(breakpoints)) stop <- TRUE + + cur_bp <- cur_bp + 1 + + } + + agenda_times <- getOption("minutemaker_temp_agenda", list()) + + if (length(agenda_times) == 0) { + warning("No agenda was inferred from the transcript.", + immediate. = T, call. = F) + return(NULL) + } + + # Remove segments that are too short or that precede the previous one. + agenda_times <- agenda_times |> purrr::imap(\(x, i) { + if (i == 1) return(agenda_times[[i]]) + + this_time <- agenda_times[[i]] + prev_time <- agenda_times[[i - 1]] + + # segments should last at least 5 minutes and not be negative + if (this_time - prev_time < 150) return(NULL) + + return(this_time) + }) |> unlist() + + message("- Extracting agenda items details") + + # Extract the talks' details from the transcript + agenda <- purrr::imap(agenda_times, \(start, i) { + # if (i == 1) start <- 1 + + # Stop at the end of the transcript if there is no next agenda element + end <- min( + c(agenda_times[i + 1], max(transcript_data$end)), + na.rm = TRUE) + + # Stop at the pause if there is one in the talk segment + pauses <- pauses[between(pauses, start, end)] + end <- min(c(end, pauses), na.rm = TRUE) + + element <- list( + # Sometimes, int are produced, which creates problems when converting to + # clocktime + from = as.numeric(start), + to = as.numeric(end) + ) + + transcript_segment <- transcript_data |> + filter( + .data$start >= element$from, + .data$end <= element$to, + ) |> readr::format_csv() + + prompt <- generate_agenda_element_prompt( + transcript_segment, + # I cannot use mget here because the prompt function is not in the + # environment of the calling function. Probably there's a way to use mget + # also here + args = list( + event_description = event_description, + vocabulary = vocabulary, + diarization_instructions = diarization_instructions) + ) + + # Build the prompt set + prompt_set <- c( + system = get_prompts("persona"), + user = prompt + ) + + result_json <- interrogate_llm( + prompt_set, + ..., force_json = TRUE + ) + + jsonlite::fromJSON(result_json, simplifyDataFrame = F) |> + c(element) + }) + + if (!is.null(start_time)) { + agenda <- agenda |> + convert_agenda_times( + convert_to = "clocktime", + event_start_time = start_time) + } + + if (!is.null(output_file)) { + dput(agenda, file = output_file) + } + + options( + minutemaker_temp_agenda_last_bp = NULL, + minutemaker_temp_agenda = NULL, + minutemaker_temp_agenda_hash = NULL + ) + + agenda +} diff --git a/README.Rmd b/README.Rmd index e131206..6795020 100644 --- a/README.Rmd +++ b/README.Rmd @@ -51,6 +51,9 @@ provide a longer context window and better summarization quality. Otherwise, the "rolling window" summarization method can be used to produce summaries of consistent quality on long transcripts also with smaller models. +In general, we suggest to use an LLM with a >32K long context window, to avoid +loss of information. + Here is an example workflow. ### Setting up the package @@ -304,6 +307,16 @@ timings are not mandatory if the meeting consists of only one talk. The agenda object itself is not strictly necessary, but can be helpful for long meetings with multiple talks. +An alternative approach is to generate the agenda automatically via the +`infer_agenda_from_transcript()` function. This function uses the transcript to +infer the different sessions of the talk, their start and end times and also +generate a description and a title. The function can use contextual information +to improve the quality of the generated agenda, such as the event description, +the audience, a vocabulary, and the expected agenda. + +It's important to review and correct the inferred agenda, since the function +might not be able to infer the correct structure of the meeting. + ### Summarizing a single meeting transcript The final step is summarizing the transcript. This can be done using the @@ -537,7 +550,16 @@ speech_to_summary_workflow( # Arguments for `summarise_full_meeting` # Assumes an existing agenda.R file in the working directory + # If an agenda doesn't exist, will ask if the LLM should infer it (see + # following arguments) agenda = "agenda.R", + + # Arguments for `infer_agenda_from_transcript` + expected_agenda = NULL, + agenda_generation_window_size = 7200, + agenda_generation_output_file = file.path(target_dir, "agenda.R"), + extra_agenda_generation_args = NULL, + summarization_output_file = "event_summary.R", event_description = event_description, diff --git a/README.md b/README.md index 857194d..4e01e8c 100644 --- a/README.md +++ b/README.md @@ -44,6 +44,9 @@ quality. Otherwise, the “rolling window” summarization method can be used to produce summaries of consistent quality on long transcripts also with smaller models. +In general, we suggest to use an LLM with a \>32K long context window, +to avoid loss of information. + Here is an example workflow. ### Setting up the package @@ -294,6 +297,18 @@ transcript. The timings are not mandatory if the meeting consists of only one talk. The agenda object itself is not strictly necessary, but can be helpful for long meetings with multiple talks. +An alternative approach is to generate the agenda automatically via the +`infer_agenda_from_transcript()` function. This function uses the +transcript to infer the different sessions of the talk, their start and +end times and also generate a description and a title. The function can +use contextual information to improve the quality of the generated +agenda, such as the event description, the audience, a vocabulary, and +the expected agenda. + +It’s important to review and correct the inferred agenda, since the +function might not be able to infer the correct structure of the +meeting. + ### Summarizing a single meeting transcript The final step is summarizing the transcript. This can be done using the @@ -525,7 +540,16 @@ speech_to_summary_workflow( # Arguments for `summarise_full_meeting` # Assumes an existing agenda.R file in the working directory + # If an agenda doesn't exist, will ask if the LLM should infer it (see + # following arguments) agenda = "agenda.R", + + # Arguments for `infer_agenda_from_transcript` + expected_agenda = NULL, + agenda_generation_window_size = 7200, + agenda_generation_output_file = file.path(target_dir, "agenda.R"), + extra_agenda_generation_args = NULL, + summarization_output_file = "event_summary.R", event_description = event_description,