feat: implement agenda inference from transcripts

Add a new function `infer_agenda_from_transcript` and related prompt generation functions. This function automates the generation of an event agenda by analyzing a given transcript. It tries to identify and extract key segments from the transcript, which are then used to construct an agenda. The process can be informed by contextual information such as event description, vocabulary, diarization instructions, and an expected agenda to guide the LLM in generating a more accurate and context-aware agenda. Additionally, the function handles JSON parsing errors and adjusts the processing window size dynamically to ensure valid JSON output from the LLM. The agenda inference process is designed to be robust, with the ability to resume from temporary data if the process is interrupted. This enhancement streamlines the workflow for summarizing meetings and conferences by providing a structured overview of the event's proceedings.
bakaburg1 · Mar 8, 2024 · c458b0d · c458b0d
1 parent e9afb2d
commit c458b0d
Show file tree

Hide file tree

Showing 5 changed files with 694 additions and 23 deletions.
diff --git a/R/data_management.R b/R/data_management.R
@@ -984,8 +984,25 @@ add_chat_transcript <- function(
 #'   generate the chat file. See `add_chat_transcript` for more details.
 #' @param agenda The agenda of the meeting, that is, a list of agenda elements
 #'   each with a session name, a title, speaker and moderator lists, type of
-#'   talk and start and end times. Alternatively, the path to an R file
-#'   containing such a list. See `summarise_full_meeting` for more details.
+#'   talk, talk description and start and end times. Alternatively, the path to
+#'   an R file containing such a list. See `summarise_full_meeting` for more
+#'   details. If NULL, the user will be asked if the system should try to
+#'   generate the agenda automatically, using the `infer_agenda_from_transcript`
+#'   function.
+#' @param expected_agenda A character string. Only used if the `agenda` argument
+#'   is `NULL` and the user requests the automatic agenda generation. This
+#'   string will be used to drive the LLM while generating the agenda. See
+#'   `infer_agenda_from_transcript` for more details.
+#' @param agenda_generation_window_size The size of the window in seconds to
+#'   analyze at once when generating the agenda. See
+#'   `infer_agenda_from_transcript` for more details.
+#' @param agenda_generation_output_file A string with the path to the output
+#'   file where the automatically generated agenda will be written. Should be a
+#'   .R file. See `infer_agenda_from_transcript` for more details.
+#' @param extra_agenda_generation_args Additional arguments passed to the
+#'  `infer_agenda_from_transcript` function. See `infer_agenda_from_transcript`
+#'  for more details. Note that the `diarization_instructions` argument for this
+#'  function will be taken from the `extra_diarization_instructions` argument.
 #' @param summarization_method A string indicating the summarization method to
 #'   use. See `summarise_full_meeting` for more details.
 #' @param event_description A string containing a description of the meeting.
@@ -1000,9 +1017,9 @@ add_chat_transcript <- function(
 #'   should take into account the diarization of the transcript. See
 #'   `summarise_transcript` for more details.
 #' @param summary_structure,extra_diarization_instructions,extra_output_instructions
-#'   Specific instructions necessary to build the summarisation prompt. See
-#'   `summarise_transcript` for more details and run `get_prompts()` to see the
-#'   defaults. See `summarise_transcript` for more details.
+#' Specific instructions necessary to build the summarisation prompt. See
+#' `summarise_transcript` for more details and run `get_prompts()` to see the
+#' defaults.
 #' @param llm_provider A string indicating the LLM provider to use for the
 #'   summarization. See `summarise_transcript` for more details.
 #' @param extra_summarise_args Additional arguments passed to the
@@ -1063,8 +1080,12 @@ speech_to_summary_workflow <- function(
                          , full.names = T)[1],
   chat_format = "webex",
 
-  # Arguments for `summarise_full_meeting`
+  # Arguments for `summarise_full_meeting` and `infer_agenda_from_transcript`
   agenda = file.path(target_dir, "agenda.R"),
+  expected_agenda = NULL,
+  agenda_generation_window_size = 7200,
+  agenda_generation_output_file = file.path(target_dir, "agenda.R"),
+  extra_agenda_generation_args = NULL,
 
   event_description = NULL,
   event_audience = "An audience with understanding of the topic",
@@ -1245,7 +1266,7 @@ speech_to_summary_workflow <- function(
     } else {
       choice <- utils::menu(
         choices = c(
-          "Generate a default agenda (i.e., process the transcript as one talk)",
+          "Generate the agenda automatically (You will need to review it before proceeding)",
           "Exit (write your own agenda)"
         ),
         title = "How do you want to proceed?"
@@ -1258,12 +1279,22 @@ speech_to_summary_workflow <- function(
     }
 
     # Infer the agenda from the transcript using the LLM if none is provided
-    agenda <- list(
-      list(
-        from = min(transcript_data$start),
-        to = max(transcript_data$end)
-      )
-    )
+    agenda_infer_args <- c(list(
+      transcript = transcript_data,
+      event_description = event_description,
+      vocabulary = vocabulary,
+      diarization_instructions = extra_diarization_instructions,
+      start_time = event_start_time,
+      expected_agenda = expected_agenda,
+      window_size = agenda_generation_window_size,
+      output_file = file.path(target_dir, "agenda.R"),
+      provider = llm_provider
+    ), extra_agenda_generation_args)
+
+    agenda <- do.call(infer_agenda_from_transcript, agenda_infer_args)
+
+    message("Agenda generated. Please review it before proceeding.")
+    return(invisible(transcript_data))
   }
 
   message("\n### Summarizing transcript...\n")

diff --git a/R/prompts.R b/R/prompts.R
@@ -74,6 +74,29 @@ set_prompts <- function(
     base_task = "Your task is to provide a summary of the transcript segments that will be given to you.",
 
     aggregate_task_rolling = "Your task is to aggregate the summaries generated from the segments of a meeting/talk into a single, comprehensive text, without loss of information.",
+    agenda_inference_task = collapse(
+      "Your task is to extract individual talks from this transcript, creating an agenda. You can identify them from a change of speaker, and or, a change of topic. Try to detect broad changes of topics so to avoid splitting the transcript into an excessively large number of small talks; a talk usually last at least 10-15 minutes to one hour, so join into one talk very short change of topics, even if the speaker change. Aggregate both the talk itself and possible Q&A sessions in the same talk.",
+      "You wil be FIRST producing a step by step reasoning of what could be a good subdivision of the transcript into different talks, considering different competing subdivisions, and THEN suggest the agenda. Take speakers, topics, and timings into consideration in your reasoning.",
+      "Your output will be a JSON object with two components: your reasoning and the actual agenda. The agenda must be an array of \"talk\" objects, each with the talk title, a short description (1 sentence), a label with the type of talk (e.g. welcome talk, conference outline, conference talk, meeting discussion, Q&A session, etc...), an array with one of more speakers, another array with moderators (if detectable) and the starting and end time in seconds. Add also the \"session\" object if it make sense as grouping.",
+      "Here's an example of the output structure:",
+      "###
+       {
+        reasoning = \"Your reasoning goes here\",
+        agenda = [
+           {
+            title = \"The talk title\",
+            type = \"Conference talk\",
+            description = \"A description of this talk\",
+            speakers = [\"speaker 1\", \"speaker 2\"],
+            moderators = [\"moderator 1\"] # If detectable, otherwise ignore this field
+            from = 1231, to = 2023
+           },
+            {...}, /* another talk element */
+           ...
+        ]
+       }
+       ###",
+      "Important: process the whole transcript, do not be lazy: your agenda should cover the entirety of the transcript."),
 
     event_description_template = collapse(
       "The following is a description of the event in which the talk/meeting took place, which will provide you with context.",
@@ -93,18 +116,15 @@ set_prompts <- function(
       "<transcript>", "{transcript}", "</transcript>"
     ),
 
-    # transcript_template_one_shot = collapse(
-    #   "Here is the transcript segment you need to summarise:",
-    #   "<transcript>", "{transcript}", "</transcript>"
-    # ),
-    #
-    # transcript_template_rolling = collapse(
-    #   "Here is the transcript of the segment you need to summarise:",
-    #   "<transcript>", "[...]\n{transcript}\n[...]", "</transcript>"
-    # ),
-
     aggregate_template_rolling = "Here are the segment summaries to aggregate:",
 
+    agenda_inference_template = collapse(
+      "This is the transcript of an event/meeting:\n<transcript>",
+      "{transcript}",
+      "</transcript>\n",
+      "The transcript is formatted as a csv with the start and end time of each segment and the segment text."
+    ),
+
     vocabulary_template = "Mind that the transcript is not perfect and the following and other terms and names may have been wrongly transcribed. Here's a list of technical terms, acronyms and names you may find in the trascript and that may have been wrongly transcribed:\n{vocabulary}.\nRecognize and correct misspelled versions of these and other related terms and names.",
 
     diarization_template = collapse(
@@ -447,3 +467,190 @@ generate_rolling_aggregation_prompt <- function(
   stringr::str_glue_data(prompt, .x = args, .null = NULL)
 
 }
+
+
+#' Generate the agenda inference prompt
+#'
+#' This function is used by `infer_agenda_from_transcript()` to generate a
+#' prompt for inferring the agenda from a transcript.
+#'
+#' @param transcript_segment A segment of the transcript to be used for
+#'   inferring the agenda. Can be a character vector representing the data in CSV
+#'   format or a data frame.
+#' @param args A list of arguments for the prompt template. They can include:
+#'   event_description, vocabulary, diarization_instructions, expected_agenda.
+#'
+#' @return A prompt used by `infer_agenda_from_transcript()`.
+#'
+generate_agenda_inference_prompt <- function(
+    transcript_segment,
+    args
+) {
+
+  if (is.data.frame(transcript_segment)) {
+    transcript_segment <- readr::format_csv(transcript_segment)
+  }
+
+  if (!is.null(args$vocabulary)) {
+    # Format the vocabulary argument if a vector is provided
+    args$vocabulary <- paste0(
+      "- ",
+      args$vocabulary,
+      collapse = "\n"
+    )
+  }
+
+  # Collapse the diarization instructions into a single string and store it as
+  # the extra_diarization_instructions argument
+  if (length(args$diarization_instructions) > 0) {
+    args$extra_diarization_instructions <- paste(
+      args$diarization_instructions, collapse = "\n"
+    )
+  }
+
+  long_arguments <- purrr::map_lgl(args, ~ length(.x) > 1)
+
+  if (any(long_arguments)) {
+    stop("All arguments in args should have length 1:\n",
+         stringr::str_flatten_comma(names(args)[long_arguments]))
+  }
+
+  prompt <- paste(
+    "Your task is to extract individual talks from a transcript, creating an agenda.",
+
+    if (!is.null(args$event_description)) {
+      # Uses the {event_description} argument
+      get_prompts("event_description_template")
+    },
+
+    if (!is.null(args$vocabulary)) {
+      # Uses the {vocabulary} argument
+      get_prompts("vocabulary_template")
+    },
+
+    # Uses the {extra_diarization_instructions} argument
+    if (!is.null(args$diarization_instructions)) {
+      get_prompts("diarization_template")
+    },
+
+    "This is the transcript of the event/meeting from which you need to infer the agenda items:\n<transcript>\n{transcript_segment}\n</transcript>\n\nThe transcript is formatted as a csv with the start and end time of each segment, the segment text and possibly, the speakers.",
+
+    sep = "\n\n"
+  ) |>
+    stringr::str_glue_data(.x = args, .null = NULL) |>
+    paste(
+      'You can identify the talks from a change of speakers, and or, a change of topic. Try to detect broad changes of topics so to avoid splitting the transcript into an excessively large number of small talks; a talk usually last at least 10-15 minutes to one hour, so join into one talk very short change of topics, even if the speaker change. Aggregate talks and the related Q&A sessions in the same talk.
+
+You wil be FIRST producing an INFORMATION DENSE, step by step reasoning of what could be a good subdivision of the transcript into different talks, considering different competing subdivisions, listing each identified talk start time and topics. THEN you will extract the starting times of each talk.
+
+Take speakers, topics, and timings into consideration in your reasoning. The reasoning doesn\'t have to be human readable. Favor a high information over length ratio.',
+
+      if (!is.null(args$expected_agenda)) {
+        stringr::str_glue_data(
+          .x = args,
+          .null = NULL,
+          "The agenda is expected to have the following talks: ###
+{expected_agenda}
+###
+Try to match the agenda you generated to this structure.")
+      },
+
+      'Your output will be a JSON object with two components: your reasoning and the start times of each identified talks. Here\'s an example of the output structure:
+###
+ {
+  reasoning = "Your reasoning goes here",
+  start_times = [1, 232, 1242, 2343, 5534, 7023, ...]
+ }
+ ###
+
+Important: process the whole transcript, do not be lazy: your agenda WILL cover the entirety of the transcript, FROM START TO END WITHOUT TIME HOLES.',
+
+      sep ="\n"
+    )
+}
+
+#' Generate the prompt to extract an agenda element details from a transcript
+#'
+#' This function is used by `infer_agenda_from_transcript()` to generate a
+#' prompt for extracting the details of an agenda element from a transcript.
+#'
+#' @param transcript_segment A segment of the transcript to be used for
+#'   extracting the details of an agenda element. Can be a character vector
+#'   representing the data in CSV format or a data frame.
+#' @param args A list of arguments to be passed to the prompt template. They can
+#'   include: event_description, vocabulary and diarization_instructions.
+#'
+#' @return A prompt used by `infer_agenda_from_transcript()`.
+#'
+generate_agenda_element_prompt <- function(
+    transcript_segment,
+    args
+) {
+
+  if (is.data.frame(transcript_segment)) {
+    transcript_segment <- readr::format_csv(transcript_segment)
+  }
+
+  if (!is.null(args$vocabulary)) {
+    # Format the vocabulary argument if a vector is provided
+    args$vocabulary <- paste0(
+      "- ",
+      args$vocabulary,
+      collapse = "\n"
+    )
+  }
+
+  # Collapse the diarization instructions into a single string and store it as
+  # the extra_diarization_instructions argument
+  if (length(args$diarization_instructions) > 0) {
+    args$extra_diarization_instructions <- paste(
+      args$diarization_instructions, collapse = "\n"
+    )
+  }
+
+  long_arguments <- purrr::map_lgl(args, ~ length(.x) > 1)
+
+  if (any(long_arguments)) {
+    stop("All arguments in args should have length 1:\n",
+         stringr::str_flatten_comma(names(args)[long_arguments]))
+  }
+
+  prompt <- paste(
+    "This is a segment of the transcript of an event/meeting:
+
+<transcript>\n{transcript_segment}\n</transcript>
+
+The transcript is formatted as a csv with the start and end time of each segment, the segment text and possibly, the speakers.",
+
+    if (!is.null(args$event_description)) {
+      # Uses the {event_description} argument
+      get_prompts("event_description_template")
+    },
+
+    if (!is.null(args$vocabulary)) {
+      # Uses the {vocabulary} argument
+      get_prompts("vocabulary_template")
+    },
+
+    # Uses the {extra_diarization_instructions} argument
+    if (!is.null(args$diarization_instructions)) {
+      get_prompts("diarization_template")
+    },
+
+    sep = "\n\n"
+  ) |>
+    stringr::str_glue_data(.x = args, .null = NULL) |>
+    paste(
+    'Your task is to extract a title and a short description (1-2 sentences max) for this talk, considering that it\'s part of a larger event. Assign also a label, e.g., welcome talk, conference outline, conference talk, meeting discussion, Q&A session, etc... (the start/end times can be helpful for this). Extract also the speakers and the moderators (if any). Format your output as a JSON object with the following structure: ###
+        {
+            title = "The talk title",
+            type = "A label to define the talk",
+            description = "A description of this talk",
+            speakers = ["speaker 1", "speaker 2"],
+            moderators = ["moderator 1"] # If detectable, otherwise ignore this field
+           }
+        ###',
+
+    sep = "\n\n"
+  )
+}