Merge pull request #19 from bakaburg1/Dev

v0.7.0: Manage events without agendas in the summarisation workflow
bakaburg1 · Apr 11, 2024 · 5311889 · 5311889
2 parents e997b75 + 9c9d184
commit 5311889
Show file tree

Hide file tree

Showing 11 changed files with 355 additions and 59 deletions.
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -16,6 +16,7 @@ Imports:
     readr (>= 2.1.4),
     rlang (>= 1.1.2),
     stringr (>= 1.5.1),
+    styler (>= 1.10.2),
     tools (>= 4.3.2),
     vctrs (>= 0.6.5)
 Config/testthat/edition: 3

diff --git a/NAMESPACE b/NAMESPACE
@@ -2,6 +2,7 @@
 
 export(add_chat_transcript)
 export(clean_transcript)
+export(entity_extractor)
 export(extract_text_from_transcript)
 export(format_summary_tree)
 export(generate_recording_details)

diff --git a/R/data_management.R b/R/data_management.R
@@ -623,7 +623,10 @@ import_transcript_from_file <- function(
       as.numeric()
 
     # Extract the text
-    text <- lines[.x + 1]
+    text <- lines[.x + 1] |>
+      # MS Teams vtt has an xml tag with speaker info
+      stringr::str_remove_all("</?v.*?>") |>
+      stringr::str_squish()
 
     # Return the data
     extract <- data.frame(
@@ -633,12 +636,23 @@ import_transcript_from_file <- function(
     )
 
     # If the transcript is diarized, extract the speaker
-    if (import_diarization && lines[.x - 1] != "") {
+    if (import_diarization) {
       # Extract the speaker
-      cur_speaker <- lines[.x - 1] |>
+      cur_speaker <- lines[.x - 1]
+
+      # Normal VTT style
+      if (stringr::str_detect(cur_speaker, "^\\d+ ")) {
         # the name is between double quotes
         stringr::str_extract_all('(?<=").*(?=")') |>
         unlist()
+      } else if (stringr::str_detect(lines[.x + 1], "^<v ")) {
+        # MS Teams vtt style: <v speaker first name [second name]>
+        cur_speaker <- stringr::str_extract_all(
+          lines[.x + 1], '(?<=<v ).*?(?=>)') |>
+        unlist()
+      } else {
+        cur_speaker <- NA
+      }
 
       # In the unlikely case that there are multiple speakers, join them
       if (length(cur_speaker) > 1) {
@@ -988,7 +1002,7 @@ add_chat_transcript <- function(
 #'   an R file containing such a list. See `summarise_full_meeting` for more
 #'   details. If NULL, the user will be asked if the system should try to
 #'   generate the agenda automatically, using the `infer_agenda_from_transcript`
-#'   function.
+#'   function. If FALSE, the agenda will not be used.
 #' @param expected_agenda A character string. Only used if the `agenda` argument
 #'   is `NULL` and the user requests the automatic agenda generation. This
 #'   string will be used to drive the LLM while generating the agenda. See
@@ -1007,7 +1021,7 @@ add_chat_transcript <- function(
 #'   use. See `summarise_full_meeting` for more details.
 #' @param event_description A string containing a description of the meeting.
 #'   See `summarise_transcript` for more details.
-#' @param event_audience A string containing a description of the audience of
+#' @param audience A string containing a description of the audience of
 #'   the meeting and what to focus on in the summary. See `summarise_transcript`
 #'   for more details.
 #' @param vocabulary A character vector of specific vocabulary words, names,
@@ -1088,7 +1102,7 @@ speech_to_summary_workflow <- function(
   extra_agenda_generation_args = NULL,
 
   event_description = NULL,
-  event_audience = "An audience with understanding of the topic",
+  audience = "An audience with understanding of the topic",
   vocabulary = NULL,
   consider_diarization = TRUE,
   summary_structure = get_prompts("summary_structure"),
@@ -1251,9 +1265,21 @@ speech_to_summary_workflow <- function(
 
   ## Perform summarization ##
 
+  if (length(agenda) > 1) {
+    stop("The agenda argument should be of length 1.")
+  }
+
+  # If the agenda argument is a character and the file does not exist, stop the
+  # process
+  if (!isFALSE(agenda) && !purrr::is_empty(agenda) &&
+      is.character(agenda) && !file.exists(agenda)) {
+    stop("The agenda file does not exist. Use agenda = FALSE to not use an
+         agenda in the creation of the summary, or agenda = NULL to generate
+         the agenda automatically (may take time).")
+  }
+
   # Agenda is not provided, ask whether to generate a default agenda
-  if (purrr::is_empty(agenda) ||
-      (is.character(agenda) && !file.exists(agenda))) {
+  if (!isFALSE(agenda) && purrr::is_empty(agenda)) {
 
     cat("No agenda was provided or found in the target directory.\n")
 
@@ -1267,34 +1293,42 @@ speech_to_summary_workflow <- function(
       choice <- utils::menu(
         choices = c(
           "Generate the agenda automatically (You will need to review it before proceeding)",
+          "Do not generate an agenda and proceed with one overall summary.",
           "Exit (write your own agenda)"
         ),
         title = "How do you want to proceed?"
       )
     }
 
-    if (choice != 1) {
+    if (choice == 2) {
+      message("Proceeding without an agenda. We suggest to use the rolling
+              window summarization method for recordings longer than 1 hour.")
+      agenda <- FALSE
+    } else if (choice != 1) {
       message("Aborted by user. Returning transcript data only (invisibly).")
       return(invisible(transcript_data))
-    }
+    } else {
 
-    # Generate a default agenda with 1 talk/meeting if none is provided
-    agenda_infer_args <- c(list(
-      transcript = transcript_data,
-      event_description = event_description,
-      vocabulary = vocabulary,
-      diarization_instructions = extra_diarization_instructions,
-      start_time = event_start_time,
-      expected_agenda = expected_agenda,
-      window_size = agenda_generation_window_size,
-      output_file = file.path(target_dir, "agenda.R"),
-      provider = llm_provider
-    ), extra_agenda_generation_args)
-
-    agenda <- do.call(infer_agenda_from_transcript, agenda_infer_args)
-
-    message("Agenda generated. Please review it before proceeding.")
-    return(invisible(transcript_data))
+      # Generate a default agenda with 1 talk/meeting if none is provided
+      agenda_infer_args <- c(list(
+        transcript = transcript_data,
+        event_description = event_description,
+        vocabulary = vocabulary,
+        diarization_instructions = extra_diarization_instructions,
+        start_time = event_start_time,
+        expected_agenda = expected_agenda,
+        window_size = agenda_generation_window_size,
+        output_file = if (!purrr::is_empty(agenda) && is.character(agenda)) {
+                          file.path(target_dir, basename(agenda))
+          } else file.path(target_dir, "agenda.R"),
+        provider = llm_provider
+      ), extra_agenda_generation_args)
+
+      agenda <- do.call(infer_agenda_from_transcript, agenda_infer_args)
+
+      message("Agenda generated. Please review it before proceeding.")
+      return(invisible(transcript_data))
+    }
   }
 
   message("\n### Summarizing transcript...\n")
@@ -1303,49 +1337,88 @@ speech_to_summary_workflow <- function(
     stop("No LLM provider defined.")
   }
 
+  # Manage situations where the formatted output file exists
+  if (!is.null(formatted_output_file) &&
+      isFALSE(overwrite_formatted_output) &&
+      file.exists(formatted_output_file)) {
+
+    if (interactive()) {
+      choice <- utils::menu(
+        choices = c(
+          "Overwrite the existing formatted summary file",
+          "Abort the process"
+        ),
+        title = "The formatted summary output file already exists and overwrite is FALSE. What do you want to do?"
+      )
+
+      if (choice == 2) {
+        message("Aborted by user.")
+        return(invisible(transcript_data))
+
+      } else {
+        message("Overwriting the existing formatted summary file.")
+      }
+    } else {
+      message("The formatted summary output file already exists and overwrite is FALSE.\nSet overwrite_formatted_output = TRUE to overwrite it or remove it.")
+      return(invisible(transcript_data))
+    }
+
+  }
+
+  # Common summarization arguments
   summarization_args <- c(list(
     transcript_data = transcript_data,
-    agenda = agenda,
     method = summarization_method,
 
     window_size = summarization_window_size,
     output_length = summarization_output_length,
 
-    output_file = summarization_output_file,
-
-    event_start_time = event_start_time,
     event_description = event_description,
-    event_audience = event_audience,
+    audience = audience,
     vocabulary = vocabulary,
     consider_diarization = consider_diarization,
 
     summary_structure = summary_structure,
     extra_diarization_instructions = extra_diarization_instructions,
     extra_output_instructions = extra_output_instructions,
 
-    provider = llm_provider,
-    overwrite = overwrite_summary_tree
+    provider = llm_provider
   ), extra_summarise_args)
 
+  if (isFALSE(agenda)) {
+    # Summarize as single talk
+
+    formatted_summary <- do.call(summarise_transcript, summarization_args)
 
-  summary_tree <- do.call(summarise_full_meeting, summarization_args)
+    return_vec <- c("transcript_data", "formatted_summary")
+
+  } else {
 
-  ## Format summary tree ##
+    # Summarize as multiple talks
 
-  if (overwrite_formatted_output || !file.exists(formatted_output_file)) {
+    # Necessary extra arguments for the summarization of whole events
+    summarization_args$agenda <- agenda
+    summarization_args$overwrite <- overwrite_summary_tree
+    summarization_args$output_file <- summarization_output_file
+    summarization_args$event_start_time <- event_start_time
 
+    summary_tree <- do.call(summarise_full_meeting, summarization_args)
+
+    ## Format summary tree ##
     message("\n### Formatting summary tree...\n")
 
     formatted_summary <- format_summary_tree(
       summary_tree = summary_tree,
       agenda = agenda,
       event_start_time = event_start_time,
-      output_file = formatted_output_file)
+      output_file = NULL)
 
-  } else {
-    message("\n### Loading existing formatted summary...\n")
-    formatted_summary <- readr::read_file(formatted_output_file)
+    return_vec <- c("transcript_data", "summary_tree", "formatted_summary")
   }
 
-  mget(c("transcript_data", "summary_tree", "formatted_summary"))
+  message("\n### Writing to file...\n")
+
+  readr::write_lines(formatted_summary, formatted_output_file)
+
+  return(mget(return_vec))
 }
diff --git a/R/prompts.R b/R/prompts.R
@@ -646,11 +646,19 @@ The transcript is formatted as a csv with the start and end time of each segment
             title = "The talk title",
             type = "A label to define the talk",
             description = "A description of this talk",
-            speakers = ["speaker 1", "speaker 2"],
-            moderators = ["moderator 1"] # If detectable, otherwise ignore this field
+            speakers = ["a list of speakers"] # If detectable, otherwise ignore this,
+            moderators = ["a list of moderatora"] # If detectable/appropriate, otherwise ignore this field
            }
         ###',
 
+    if (!is.null(args$expected_agenda_element)) {
+      paste("The event expected agenda is the following, so try to match the extracted talk to this structure. But feel free to describe a novel element if you cannot find a logical match, since there could have been unexpected changes in the agenda: ###\n",
+            args$expected_agenda_element,
+            "\n###")
+    },
+
+    "Provide your output.",
+
     sep = "\n\n"
   )
 }