Skip to content

Commit

Permalink
Merge pull request #19 from bakaburg1/Dev
Browse files Browse the repository at this point in the history
v0.7.0: Manage events without agendas in the summarisation workflow
  • Loading branch information
bakaburg1 authored Apr 11, 2024
2 parents e997b75 + 9c9d184 commit 5311889
Show file tree
Hide file tree
Showing 11 changed files with 355 additions and 59 deletions.
1 change: 1 addition & 0 deletions DESCRIPTION
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@ Imports:
readr (>= 2.1.4),
rlang (>= 1.1.2),
stringr (>= 1.5.1),
styler (>= 1.10.2),
tools (>= 4.3.2),
vctrs (>= 0.6.5)
Config/testthat/edition: 3
Expand Down
1 change: 1 addition & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@

export(add_chat_transcript)
export(clean_transcript)
export(entity_extractor)
export(extract_text_from_transcript)
export(format_summary_tree)
export(generate_recording_details)
Expand Down
157 changes: 115 additions & 42 deletions R/data_management.R
Original file line number Diff line number Diff line change
Expand Up @@ -623,7 +623,10 @@ import_transcript_from_file <- function(
as.numeric()

# Extract the text
text <- lines[.x + 1]
text <- lines[.x + 1] |>
# MS Teams vtt has an xml tag with speaker info
stringr::str_remove_all("</?v.*?>") |>
stringr::str_squish()

# Return the data
extract <- data.frame(
Expand All @@ -633,12 +636,23 @@ import_transcript_from_file <- function(
)

# If the transcript is diarized, extract the speaker
if (import_diarization && lines[.x - 1] != "") {
if (import_diarization) {
# Extract the speaker
cur_speaker <- lines[.x - 1] |>
cur_speaker <- lines[.x - 1]

# Normal VTT style
if (stringr::str_detect(cur_speaker, "^\\d+ ")) {
# the name is between double quotes
stringr::str_extract_all('(?<=").*(?=")') |>
unlist()
} else if (stringr::str_detect(lines[.x + 1], "^<v ")) {
# MS Teams vtt style: <v speaker first name [second name]>
cur_speaker <- stringr::str_extract_all(
lines[.x + 1], '(?<=<v ).*?(?=>)') |>
unlist()
} else {
cur_speaker <- NA
}

# In the unlikely case that there are multiple speakers, join them
if (length(cur_speaker) > 1) {
Expand Down Expand Up @@ -988,7 +1002,7 @@ add_chat_transcript <- function(
#' an R file containing such a list. See `summarise_full_meeting` for more
#' details. If NULL, the user will be asked if the system should try to
#' generate the agenda automatically, using the `infer_agenda_from_transcript`
#' function.
#' function. If FALSE, the agenda will not be used.
#' @param expected_agenda A character string. Only used if the `agenda` argument
#' is `NULL` and the user requests the automatic agenda generation. This
#' string will be used to drive the LLM while generating the agenda. See
Expand All @@ -1007,7 +1021,7 @@ add_chat_transcript <- function(
#' use. See `summarise_full_meeting` for more details.
#' @param event_description A string containing a description of the meeting.
#' See `summarise_transcript` for more details.
#' @param event_audience A string containing a description of the audience of
#' @param audience A string containing a description of the audience of
#' the meeting and what to focus on in the summary. See `summarise_transcript`
#' for more details.
#' @param vocabulary A character vector of specific vocabulary words, names,
Expand Down Expand Up @@ -1088,7 +1102,7 @@ speech_to_summary_workflow <- function(
extra_agenda_generation_args = NULL,

event_description = NULL,
event_audience = "An audience with understanding of the topic",
audience = "An audience with understanding of the topic",
vocabulary = NULL,
consider_diarization = TRUE,
summary_structure = get_prompts("summary_structure"),
Expand Down Expand Up @@ -1251,9 +1265,21 @@ speech_to_summary_workflow <- function(

## Perform summarization ##

if (length(agenda) > 1) {
stop("The agenda argument should be of length 1.")
}

# If the agenda argument is a character and the file does not exist, stop the
# process
if (!isFALSE(agenda) && !purrr::is_empty(agenda) &&
is.character(agenda) && !file.exists(agenda)) {
stop("The agenda file does not exist. Use agenda = FALSE to not use an
agenda in the creation of the summary, or agenda = NULL to generate
the agenda automatically (may take time).")
}

# Agenda is not provided, ask whether to generate a default agenda
if (purrr::is_empty(agenda) ||
(is.character(agenda) && !file.exists(agenda))) {
if (!isFALSE(agenda) && purrr::is_empty(agenda)) {

cat("No agenda was provided or found in the target directory.\n")

Expand All @@ -1267,34 +1293,42 @@ speech_to_summary_workflow <- function(
choice <- utils::menu(
choices = c(
"Generate the agenda automatically (You will need to review it before proceeding)",
"Do not generate an agenda and proceed with one overall summary.",
"Exit (write your own agenda)"
),
title = "How do you want to proceed?"
)
}

if (choice != 1) {
if (choice == 2) {
message("Proceeding without an agenda. We suggest to use the rolling
window summarization method for recordings longer than 1 hour.")
agenda <- FALSE
} else if (choice != 1) {
message("Aborted by user. Returning transcript data only (invisibly).")
return(invisible(transcript_data))
}
} else {

# Generate a default agenda with 1 talk/meeting if none is provided
agenda_infer_args <- c(list(
transcript = transcript_data,
event_description = event_description,
vocabulary = vocabulary,
diarization_instructions = extra_diarization_instructions,
start_time = event_start_time,
expected_agenda = expected_agenda,
window_size = agenda_generation_window_size,
output_file = file.path(target_dir, "agenda.R"),
provider = llm_provider
), extra_agenda_generation_args)

agenda <- do.call(infer_agenda_from_transcript, agenda_infer_args)

message("Agenda generated. Please review it before proceeding.")
return(invisible(transcript_data))
# Generate a default agenda with 1 talk/meeting if none is provided
agenda_infer_args <- c(list(
transcript = transcript_data,
event_description = event_description,
vocabulary = vocabulary,
diarization_instructions = extra_diarization_instructions,
start_time = event_start_time,
expected_agenda = expected_agenda,
window_size = agenda_generation_window_size,
output_file = if (!purrr::is_empty(agenda) && is.character(agenda)) {
file.path(target_dir, basename(agenda))
} else file.path(target_dir, "agenda.R"),
provider = llm_provider
), extra_agenda_generation_args)

agenda <- do.call(infer_agenda_from_transcript, agenda_infer_args)

message("Agenda generated. Please review it before proceeding.")
return(invisible(transcript_data))
}
}

message("\n### Summarizing transcript...\n")
Expand All @@ -1303,49 +1337,88 @@ speech_to_summary_workflow <- function(
stop("No LLM provider defined.")
}

# Manage situations where the formatted output file exists
if (!is.null(formatted_output_file) &&
isFALSE(overwrite_formatted_output) &&
file.exists(formatted_output_file)) {

if (interactive()) {
choice <- utils::menu(
choices = c(
"Overwrite the existing formatted summary file",
"Abort the process"
),
title = "The formatted summary output file already exists and overwrite is FALSE. What do you want to do?"
)

if (choice == 2) {
message("Aborted by user.")
return(invisible(transcript_data))

} else {
message("Overwriting the existing formatted summary file.")
}
} else {
message("The formatted summary output file already exists and overwrite is FALSE.\nSet overwrite_formatted_output = TRUE to overwrite it or remove it.")
return(invisible(transcript_data))
}

}

# Common summarization arguments
summarization_args <- c(list(
transcript_data = transcript_data,
agenda = agenda,
method = summarization_method,

window_size = summarization_window_size,
output_length = summarization_output_length,

output_file = summarization_output_file,

event_start_time = event_start_time,
event_description = event_description,
event_audience = event_audience,
audience = audience,
vocabulary = vocabulary,
consider_diarization = consider_diarization,

summary_structure = summary_structure,
extra_diarization_instructions = extra_diarization_instructions,
extra_output_instructions = extra_output_instructions,

provider = llm_provider,
overwrite = overwrite_summary_tree
provider = llm_provider
), extra_summarise_args)

if (isFALSE(agenda)) {
# Summarize as single talk

formatted_summary <- do.call(summarise_transcript, summarization_args)

summary_tree <- do.call(summarise_full_meeting, summarization_args)
return_vec <- c("transcript_data", "formatted_summary")

} else {

## Format summary tree ##
# Summarize as multiple talks

if (overwrite_formatted_output || !file.exists(formatted_output_file)) {
# Necessary extra arguments for the summarization of whole events
summarization_args$agenda <- agenda
summarization_args$overwrite <- overwrite_summary_tree
summarization_args$output_file <- summarization_output_file
summarization_args$event_start_time <- event_start_time

summary_tree <- do.call(summarise_full_meeting, summarization_args)

## Format summary tree ##
message("\n### Formatting summary tree...\n")

formatted_summary <- format_summary_tree(
summary_tree = summary_tree,
agenda = agenda,
event_start_time = event_start_time,
output_file = formatted_output_file)
output_file = NULL)

} else {
message("\n### Loading existing formatted summary...\n")
formatted_summary <- readr::read_file(formatted_output_file)
return_vec <- c("transcript_data", "summary_tree", "formatted_summary")
}

mget(c("transcript_data", "summary_tree", "formatted_summary"))
message("\n### Writing to file...\n")

readr::write_lines(formatted_summary, formatted_output_file)

return(mget(return_vec))
}
12 changes: 10 additions & 2 deletions R/prompts.R
Original file line number Diff line number Diff line change
Expand Up @@ -646,11 +646,19 @@ The transcript is formatted as a csv with the start and end time of each segment
title = "The talk title",
type = "A label to define the talk",
description = "A description of this talk",
speakers = ["speaker 1", "speaker 2"],
moderators = ["moderator 1"] # If detectable, otherwise ignore this field
speakers = ["a list of speakers"] # If detectable, otherwise ignore this,
moderators = ["a list of moderatora"] # If detectable/appropriate, otherwise ignore this field
}
###',

if (!is.null(args$expected_agenda_element)) {
paste("The event expected agenda is the following, so try to match the extracted talk to this structure. But feel free to describe a novel element if you cannot find a logical match, since there could have been unexpected changes in the agenda: ###\n",
args$expected_agenda_element,
"\n###")
},

"Provide your output.",

sep = "\n\n"
)
}
Loading

0 comments on commit 5311889

Please sign in to comment.