Merge pull request #17 from bakaburg1/feature/infer-agenda

Implement Agenda Inference from Transcripts and Enhance Summarization Workflow
bakaburg1 · Mar 8, 2024 · d7b5a85 · d7b5a85
2 parents 3636930 + 4584c73
commit d7b5a85
Show file tree

Hide file tree

Showing 21 changed files with 2,625 additions and 47 deletions.
diff --git a/.Rprofile b/.Rprofile
@@ -1 +1 @@
-
+source("renv/activate.R")
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -1,10 +1,11 @@
 Package: minutemaker
 Title: GenAI-based meeting and conferences minutes generator
-Version: 0.5.5
+Version: 0.6.0
 Authors@R: 
     person("Angelo", "D'Ambrosio", , "a.dambrosioMD@gmail.com", role = c("aut", "cre"),
            comment = c(ORCID = "0000-0002-2045-5155"))
-Description: Generate meeting minutes starting from an audio recording or a transcripts using speech-to-text and LLMs.
+Description: Generate meeting minutes starting from an audio recording or a
+    transcript using speech-to-text and LLMs.
 License: MIT + file LICENSE
 Imports: 
     dplyr (>= 1.1.4),
@@ -20,10 +21,12 @@ Imports:
 Config/testthat/edition: 3
 Encoding: UTF-8
 Roxygen: list(markdown = TRUE)
-RoxygenNote: 7.2.3
+RoxygenNote: 7.3.1
 Suggests: 
     av (>= 0.9.0),
+    devtools (>= 2.4.5),
     parallel (>= 4.3.2),
     testthat (>= 3.0.0),
     text2vec (>= 0.6.4),
     tictoc (>= 1.2),
+    usethis (>= 2.2.3)
diff --git a/NAMESPACE b/NAMESPACE
@@ -7,6 +7,7 @@ export(format_summary_tree)
 export(generate_recording_details)
 export(get_prompts)
 export(import_transcript_from_file)
+export(infer_agenda_from_transcript)
 export(interrogate_llm)
 export(merge_transcripts)
 export(parse_transcript_json)

diff --git a/R/LLM_calls.R b/R/LLM_calls.R
@@ -35,10 +35,17 @@
 #'
 process_messages <- function(messages) {
 
-  if (missing(messages) || is.null(messages)) {
+  if (missing(messages) || is.null(messages) || length(messages) == 0) {
     stop("User messages are required.")
   }
 
+  # Assume that a single message is from the user
+  if (length(messages) == 1 &&
+      is.character(messages) &&
+      is.null(names(messages))) {
+    messages <- c(user = messages)
+  }
+
   # Convert vector to list format
   vector_to_list <- function(msg_vec) {
 
@@ -199,7 +206,7 @@ interrogate_llm <- function(
 
     if (httr::status_code(response) == 429) {
       warning("Rate limit exceeded. Waiting before retrying.",
-              immediate. = TRUE)
+              immediate. = TRUE, call. = FALSE)
 
       to_wait <- as.numeric(httr::headers(response)$`retry-after`)
       message("Waiting for ", to_wait, " seconds.\n...")

diff --git a/R/data_management.R b/R/data_management.R
@@ -81,6 +81,11 @@ parse_transcript_json <- function(
            "Please remove it and try transcription again.")
     }
 
+    if (length(transcript_list[[i]]$segments) == 0) {
+      # skip this file, there was nothing to transcribe
+      next
+    }
+
     transcript_data <- transcript_list[[i]]$segments |>
       bind_rows() |>
       # Select only the columns to import
@@ -280,11 +285,12 @@ extract_text_from_transcript <- function(
 
   # Ignore the `import_diarization` parameter if the transcript does not contain
   # speaker information
-  if (all(is.na(transcript_data$speaker)) ||
+  if (!"speaker" %in% names(transcript_data) ||
+      all(is.na(transcript_data$speaker)) ||
       n_distinct(transcript_data$speaker, na.rm = T) == 1) {
     import_diarization <- FALSE
 
-    transcript_data$speaker <- "None"
+    transcript_data$speaker <- "Unknown"
   }
 
   transcript <- transcript_data %>%
@@ -386,16 +392,6 @@ convert_agenda_times <- function(
     validate_agenda_element(agenda[[i]], from = TRUE, to = TRUE)
   }
 
-  # if (
-  #   convert_to == "clocktime" &&
-  #   inherits(agenda[[1]]["from"], c("POSIXct", "character"))) {
-  #
-  #   warning("Agenda already in clock time format.",
-  #           call. = FALSE, immediate. = TRUE)
-  #
-  #   return(agenda_orig)
-  # }
-
   # Check if agenda times are all of the same class
   if (!all(purrr::map_lgl(agenda, ~ is.numeric(.x$from))) &&
       !all(purrr::map_lgl(agenda, ~ {
@@ -988,8 +984,25 @@ add_chat_transcript <- function(
 #'   generate the chat file. See `add_chat_transcript` for more details.
 #' @param agenda The agenda of the meeting, that is, a list of agenda elements
 #'   each with a session name, a title, speaker and moderator lists, type of
-#'   talk and start and end times. Alternatively, the path to an R file
-#'   containing such a list. See `summarise_full_meeting` for more details.
+#'   talk, talk description and start and end times. Alternatively, the path to
+#'   an R file containing such a list. See `summarise_full_meeting` for more
+#'   details. If NULL, the user will be asked if the system should try to
+#'   generate the agenda automatically, using the `infer_agenda_from_transcript`
+#'   function.
+#' @param expected_agenda A character string. Only used if the `agenda` argument
+#'   is `NULL` and the user requests the automatic agenda generation. This
+#'   string will be used to guide the LLM while generating the agenda. See
+#'   `infer_agenda_from_transcript` for more details.
+#' @param agenda_generation_window_size The size of the window in seconds to
+#'   analyze at once when generating the agenda. See
+#'   `infer_agenda_from_transcript` for more details.
+#' @param agenda_generation_output_file A string with the path to the output
+#'   file where the automatically generated agenda will be written. Should be a
+#'   .R file. See `infer_agenda_from_transcript` for more details.
+#' @param extra_agenda_generation_args Additional arguments passed to the
+#'  `infer_agenda_from_transcript` function. See `infer_agenda_from_transcript`
+#'  for more details. Note that the `diarization_instructions` argument for this
+#'  function will be taken from the `extra_diarization_instructions` argument.
 #' @param summarization_method A string indicating the summarization method to
 #'   use. See `summarise_full_meeting` for more details.
 #' @param event_description A string containing a description of the meeting.
@@ -1004,9 +1017,9 @@ add_chat_transcript <- function(
 #'   should take into account the diarization of the transcript. See
 #'   `summarise_transcript` for more details.
 #' @param summary_structure,extra_diarization_instructions,extra_output_instructions
-#'   Specific instructions necessary to build the summarisation prompt. See
-#'   `summarise_transcript` for more details and run `get_prompts()` to see the
-#'   defaults. See `summarise_transcript` for more details.
+#' Specific instructions necessary to build the summarisation prompt. See
+#' `summarise_transcript` for more details and run `get_prompts()` to see the
+#' defaults.
 #' @param llm_provider A string indicating the LLM provider to use for the
 #'   summarization. See `summarise_transcript` for more details.
 #' @param extra_summarise_args Additional arguments passed to the
@@ -1067,8 +1080,12 @@ speech_to_summary_workflow <- function(
                          , full.names = T)[1],
   chat_format = "webex",
 
-  # Arguments for `summarise_full_meeting`
+  # Arguments for `summarise_full_meeting` and `infer_agenda_from_transcript`
   agenda = file.path(target_dir, "agenda.R"),
+  expected_agenda = NULL,
+  agenda_generation_window_size = 3600,
+  agenda_generation_output_file = file.path(target_dir, "agenda.R"),
+  extra_agenda_generation_args = NULL,
 
   event_description = NULL,
   event_audience = "An audience with understanding of the topic",
@@ -1249,7 +1266,7 @@ speech_to_summary_workflow <- function(
     } else {
       choice <- utils::menu(
         choices = c(
-          "Generate a default agenda (i.e., process the transcript as one talk)",
+          "Generate the agenda automatically (You will need to review it before proceeding)",
           "Exit (write your own agenda)"
         ),
         title = "How do you want to proceed?"
@@ -1262,12 +1279,22 @@ speech_to_summary_workflow <- function(
     }
 
     # Infer the agenda from the transcript using the LLM if none is provided
-    agenda <- list(
-      list(
-        from = min(transcript_data$start),
-        to = max(transcript_data$end)
-      )
-    )
+    agenda_infer_args <- c(list(
+      transcript = transcript_data,
+      event_description = event_description,
+      vocabulary = vocabulary,
+      diarization_instructions = extra_diarization_instructions,
+      start_time = event_start_time,
+      expected_agenda = expected_agenda,
+      window_size = agenda_generation_window_size,
+      output_file = file.path(target_dir, "agenda.R"),
+      provider = llm_provider
+    ), extra_agenda_generation_args)
+
+    agenda <- do.call(infer_agenda_from_transcript, agenda_infer_args)
+
+    message("Agenda generated. Please review it before proceeding.")
+    return(invisible(transcript_data))
   }
 
   message("\n### Summarizing transcript...\n")