From cfa96733d86879ca4977c65a8d8b58eace108af2 Mon Sep 17 00:00:00 2001 From: Angelo D'Ambrosio Date: Thu, 11 Apr 2024 11:33:33 +0200 Subject: [PATCH] feat: importation of MS Teams vtt files MS Teams doesn't follow the VTT specification, of course: it puts the speaker info in an XML tag inside the cue text. --- R/data_management.R | 20 +++++++++++++++++--- 1 file changed, 17 insertions(+), 3 deletions(-) diff --git a/R/data_management.R b/R/data_management.R index 2e1fc24..ae92034 100644 --- a/R/data_management.R +++ b/R/data_management.R @@ -623,7 +623,10 @@ import_transcript_from_file <- function( as.numeric() # Extract the text - text <- lines[.x + 1] + text <- lines[.x + 1] |> + # MS Teams vtt has an xml tag with speaker info + stringr::str_remove_all("") |> + stringr::str_squish() # Return the data extract <- data.frame( @@ -633,12 +636,23 @@ import_transcript_from_file <- function( ) # If the transcript is diarized, extract the speaker - if (import_diarization && lines[.x - 1] != "") { + if (import_diarization) { # Extract the speaker - cur_speaker <- lines[.x - 1] |> + cur_speaker <- lines[.x - 1] + + # Normal VTT style + if (stringr::str_detect(cur_speaker, "^\\d+ ")) { # the name is between double quotes stringr::str_extract_all('(?<=").*(?=")') |> unlist() + } else if (stringr::str_detect(lines[.x + 1], "^ + cur_speaker <- stringr::str_extract_all( + lines[.x + 1], '(?<=)') |> + unlist() + } else { + cur_speaker <- NA + } # In the unlikely case that there are multiple speakers, join them if (length(cur_speaker) > 1) {