From 69e4f5e59518da51d7f757a5076511d4224c6d65 Mon Sep 17 00:00:00 2001
From: Angelo D'Ambrosio
Date: Fri, 11 Oct 2024 18:45:09 +0200
Subject: [PATCH] feat(stt): add support for local Whisper models

- Added `use_whisper_local_stt` function to support local Whisper models
  via Python with reticulate.
- Added `use_mlx_whisper_local_stt` function for MLX Whisper models,
  optimized for Mac OS with Apple Silicon.
- Updated `perform_speech_to_text` to use `whisper_local` as the default
  model.
- Enhanced `speech_to_summary_workflow` to display the selected
  speech-to-text model.
- Updated documentation and NAMESPACE to export the new functions.
- Added `reticulate` to the Suggests field in DESCRIPTION for Python
  integration.
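The new default can also be selected globally through the
`minutemaker_stt_model` option read by `perform_speech_to_text`. A
minimal sketch (the audio file name is illustrative):

    # Select the local Whisper backend for all subsequent transcriptions
    options(minutemaker_stt_model = "whisper_local")

    # perform_speech_to_text() now picks the model up from the option
    perform_speech_to_text(audio_path = "meeting.wav")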
---
 DESCRIPTION                      |   1 +
 NAMESPACE                        |   2 +
 R/data_management.R              |   1 +
 R/speech_to_text.R               | 226 +++++++++++++++++++++++++++++--
 man/perform_speech_to_text.Rd    |   2 +-
 man/use_mlx_whisper_local_stt.Rd |  37 +++++
 man/use_whisper_local_stt.Rd     |  37 +++++
 7 files changed, 291 insertions(+), 15 deletions(-)
 create mode 100644 man/use_mlx_whisper_local_stt.Rd
 create mode 100644 man/use_whisper_local_stt.Rd

diff --git a/DESCRIPTION b/DESCRIPTION
index e54aa4f..f3f5bfe 100644
--- a/DESCRIPTION
+++ b/DESCRIPTION
@@ -30,6 +30,7 @@ Suggests:
     av (>= 0.9.0),
     devtools (>= 2.4.5),
     parallel (>= 4.3.2),
+    reticulate (>= 1.38.0),
     testthat (>= 3.0.0),
     text2vec (>= 0.6.4),
     tictoc (>= 1.2),
diff --git a/NAMESPACE b/NAMESPACE
index c72e775..2e20e68 100644
--- a/NAMESPACE
+++ b/NAMESPACE
@@ -19,6 +19,8 @@ export(speech_to_summary_workflow)
 export(split_audio)
 export(summarise_full_meeting)
 export(summarise_transcript)
+export(use_mlx_whisper_local_stt)
+export(use_whisper_local_stt)
 export(validate_agenda)
 import(dplyr)
 importFrom(stats,setNames)
diff --git a/R/data_management.R b/R/data_management.R
index c0f6ac7..3d33d9f 100644
--- a/R/data_management.R
+++ b/R/data_management.R
@@ -1239,6 +1239,7 @@ speech_to_summary_workflow <- function(
 ) {
 
   message("\n### Performing speech to text...\n")
+  message("(stt model: ", stt_model, ")\n")
 
   # A speech-to-text model is required
   if (is.null(stt_model)) {
diff --git a/R/speech_to_text.R b/R/speech_to_text.R
index 21be5ea..84f594a 100644
--- a/R/speech_to_text.R
+++ b/R/speech_to_text.R
@@ -33,7 +33,7 @@ perform_speech_to_text <- function(
     audio_path,
     output_dir = file.path(dirname(audio_path), "transcription_output_data"),
-    model,
+    model = getOption("minutemaker_stt_model", "whisper_local"),
     initial_prompt = NULL,
     overwrite = FALSE,
     language = "en",
     ...
@@ -395,25 +395,25 @@ use_azure_whisper_stt <- function(
     warning("Error ", response$status_code, " in Azure Whisper API request: ",
             httr::content(response, "text"), call. = FALSE, immediate. = TRUE)
 
-      wait_for <- stringr::str_extract(
-        httr::content(response, "text", encoding = "UTF-8"),
-        "\\d+(?= seconds)") |> as.numeric()
+    wait_for <- stringr::str_extract(
+      httr::content(response, "text", encoding = "UTF-8"),
+      "\\d+(?= seconds)") |> as.numeric()
 
-      if (is.na(wait_for) && !interactive()) stop()
+    if (is.na(wait_for) && !interactive()) stop()
 
-      if (is.na(wait_for)) wait_for <- 30
+    if (is.na(wait_for)) wait_for <- 30
 
-      message("Retrying in ", wait_for, " seconds...")
+    message("Retrying in ", wait_for, " seconds...")
 
-      Sys.sleep(wait_for)
+    Sys.sleep(wait_for)
 
-      res <- use_azure_whisper_stt(
-        audio_file = audio_file,
-        language = language,
-        initial_prompt = initial_prompt,
-        temperature = temperature)
+    res <- use_azure_whisper_stt(
+      audio_file = audio_file,
+      language = language,
+      initial_prompt = initial_prompt,
+      temperature = temperature)
 
-      return(res)
+    return(res)
   }
 
   # Return the response
@@ -489,3 +489,201 @@ use_openai_whisper_stt <- function(
   # Return the response
   res <- httr::content(response)
 }
+
+#' Use Local Whisper Model for Speech-to-Text
+#'
+#' This function uses a local Whisper model via Python with reticulate to
+#' transcribe audio. It can use the official OpenAI Whisper package or any
+#' compatible Python package.
+#'
+#' @param audio_file The path to the audio file to transcribe.
+#' @param language The language of the input audio. Default is "en" for English.
+#' If NULL, Whisper will attempt to detect the language.
+#' @param initial_prompt Text to guide the model's style or continue a previous
+#' segment.
+#' @param model The Whisper model to use. Default is "turbo". Check
+#' https://github.com/openai/whisper for other available models.
+#' @param whisper_package The Python package to use for Whisper (default:
+#' "openai-whisper").
+#'
+#' @return A list with the full transcript and the transcription by segments.
+#'
+#' @export
+use_whisper_local_stt <- function(
+    audio_file,
+    language = "en",
+    initial_prompt = "",
+    model = "turbo",
+    whisper_package = getOption(
+      "minutemaker_whisper_package", "openai-whisper")
+) {
+
+  # Check if reticulate is installed
+  if (!rlang::is_installed("reticulate")) {
+    stop("Package 'reticulate' is required. ",
+         "Please install it using install.packages('reticulate')")
+  }
+
+  # Check if Miniconda is installed
+  if (length(list.files(reticulate::miniconda_path())) == 0) {
+    message("Miniconda not found. Installing it now...")
+    reticulate::install_miniconda()
+  }
+
+  conda_env <- "minutemaker_env"
+
+  # Check if the conda environment exists
+  if (!reticulate::condaenv_exists(conda_env)) {
+    message(
+      "Conda environment '", conda_env, "' does not exist. Creating it now...")
+
+    reticulate::conda_create(conda_env, python_version = "3.9")
+  }
+
+  # Use the conda environment
+  reticulate::use_miniconda(conda_env, required = TRUE)
+
+  # Check if Whisper is already installed
+  if (!reticulate::py_module_available("whisper")) {
+    message("Whisper not found. Installing dependencies...")
+
+    # Install the required packages
+    reticulate::conda_install(
+      conda_env,
+      c("numpy==1.23.5", "numba==0.56.4", "llvmlite==0.39.1", whisper_package),
+      pip = TRUE)
+  }
+
+  # Import the Whisper module
+  whisper <- reticulate::import("whisper")
+
+  # Load the Whisper model
+  model <- whisper$load_model(model)
+
+  # Prepare transcription options
+  options <- list(
+    language = language,
+    initial_prompt = initial_prompt,
+    fp16 = FALSE
+  )
+
+  # Remove NULL values from options
+  options <- options[!sapply(options, is.null)]
+
+  # Perform transcription
+  result <- do.call(model$transcribe, c(list(audio_file), options))
+
+  # Extract segments
+  segments <- lapply(result$segments, function(seg) {
+    list(
+      id = seg$id,
+      start = seg$start,
+      end = seg$end,
+      text = seg$text
+    )
+  })
+
+  # Return results in the expected format
+  list(
+    text = result$text,
+    segments = segments
+  )
+}
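For reference, a minimal direct call of the new function might look like
this (a sketch, not part of the patch; it assumes `meeting.wav` exists and
that the Miniconda/conda setup above succeeds on first use):

    res <- use_whisper_local_stt(
      "meeting.wav",
      language = "en",
      model = "turbo"
    )

    res$text          # full transcript
    res$segments[[1]] # first timed segment (id, start, end, text)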
+
+#' Use MLX Whisper Local Model for Speech-to-Text (Mac OS only)
+#'
+#' This function uses a local MLX Whisper model via Python with reticulate to
+#' transcribe audio. It is specifically designed to work with the MLX Whisper
+#' package. MLX allows faster inference on Mac OS with Apple Silicon.
+#'
+#' @param audio_file The path to the audio file to transcribe.
+#' @param language The language of the input audio. Default is "en" for English.
+#' If NULL, Whisper will attempt to detect the language.
+#' @param initial_prompt Text to guide the model's style or continue a previous
+#' segment.
+#' @param model The MLX Whisper model to use. Default is
+#' "mlx-community/distil-whisper-large-v3".
+#' @param whisper_package The Python package to use for MLX Whisper (default:
+#' "mlx_whisper").
+#'
+#' @return A list with the full transcript and the transcription by segments.
+#'
+#' @export
+use_mlx_whisper_local_stt <- function(
+    audio_file,
+    language = "en",
+    initial_prompt = "",
+    model = "mlx-community/distil-whisper-large-v3",
+    whisper_package = getOption("minutemaker_whisper_package", "mlx_whisper")
+) {
+
+  # Check if reticulate is installed
+  if (!rlang::is_installed("reticulate")) {
+    stop("Package 'reticulate' is required. ",
+         "Please install it using install.packages('reticulate')")
+  }
+
+  # Check if Miniconda is installed
+  if (length(list.files(reticulate::miniconda_path())) == 0) {
+    message("Miniconda not found. Installing it now...")
+    reticulate::install_miniconda()
+  }
+
+  conda_env <- "minutemaker_env"
+
+  # Check if the conda environment exists
+  if (!reticulate::condaenv_exists(conda_env)) {
+    message(
+      "Conda environment '", conda_env, "' does not exist. Creating it now...")
+
+    reticulate::conda_create(conda_env, python_version = "3.9")
+  }
+
+  # Use the conda environment
+  reticulate::use_condaenv(conda_env, required = TRUE)
+
+  # Check if MLX Whisper is already installed
+  if (!reticulate::py_module_available(whisper_package)) {
+    message("MLX Whisper not found. Installing dependencies...")
+
+    # Install the required package
+    reticulate::conda_install(conda_env, whisper_package, pip = TRUE)
+  }
+
+  # Import the MLX Whisper module
+  mlx_whisper <- reticulate::import(whisper_package)
+
+  # Prepare transcription options
+  decode_options <- list(
+    language = language,
+    initial_prompt = initial_prompt
+  )
+
+  # Remove NULL values from options
+  decode_options <- decode_options[!sapply(decode_options, is.null)]
+
+  # Perform transcription; do.call() splices the optional decoding
+  # arguments into the call, mirroring use_whisper_local_stt()
+  result <- do.call(
+    mlx_whisper$transcribe,
+    c(list(audio_file,
+           path_or_hf_repo = model,
+           fp16 = FALSE,
+           word_timestamps = TRUE),
+      decode_options))
+
+  # Extract segments
+  segments <- lapply(result$segments, function(seg) {
+    list(
+      id = seg$id,
+      start = seg$start,
+      end = seg$end,
+      text = seg$text
+    )
+  })
+
+  # Return results in the expected format
+  list(
+    text = result$text,
+    segments = segments
+  )
+}
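The MLX variant has the same call shape, swapping the model name for a
Hugging Face repository identifier (again a sketch rather than patch
content; it requires Mac OS on Apple Silicon, and the model weights are
downloaded on first use):

    res <- use_mlx_whisper_local_stt(
      "meeting.wav",
      language = "en",
      model = "mlx-community/distil-whisper-large-v3"
    )

    res$text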
diff --git a/man/perform_speech_to_text.Rd b/man/perform_speech_to_text.Rd
index f8cd860..32a8e45 100644
--- a/man/perform_speech_to_text.Rd
+++ b/man/perform_speech_to_text.Rd
@@ -7,7 +7,7 @@
 perform_speech_to_text(
   audio_path,
   output_dir = file.path(dirname(audio_path), "transcription_output_data"),
-  model,
+  model = getOption("minutemaker_stt_model", "whisper_local"),
   initial_prompt = NULL,
   overwrite = FALSE,
   language = "en",
diff --git a/man/use_mlx_whisper_local_stt.Rd b/man/use_mlx_whisper_local_stt.Rd
new file mode 100644
index 0000000..0f6967a
--- /dev/null
+++ b/man/use_mlx_whisper_local_stt.Rd
@@ -0,0 +1,37 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/speech_to_text.R
+\name{use_mlx_whisper_local_stt}
+\alias{use_mlx_whisper_local_stt}
+\title{Use MLX Whisper Local Model for Speech-to-Text (Mac OS only)}
+\usage{
+use_mlx_whisper_local_stt(
+  audio_file,
+  language = "en",
+  initial_prompt = "",
+  model = "mlx-community/distil-whisper-large-v3",
+  whisper_package = getOption("minutemaker_whisper_package", "mlx_whisper")
+)
+}
+\arguments{
+\item{audio_file}{The path to the audio file to transcribe.}
+
+\item{language}{The language of the input audio. Default is "en" for English.
+If NULL, Whisper will attempt to detect the language.}
+
+\item{initial_prompt}{Text to guide the model's style or continue a previous
+segment.}
+
+\item{model}{The MLX Whisper model to use. Default is
+"mlx-community/distil-whisper-large-v3".}
+
+\item{whisper_package}{The Python package to use for MLX Whisper (default:
+"mlx_whisper").}
+}
+\value{
+A list with the full transcript and the transcription by segments.
+}
+\description{
+This function uses a local MLX Whisper model via Python with reticulate to
+transcribe audio. It is specifically designed to work with the MLX Whisper
+package. MLX allows faster inference on Mac OS with Apple Silicon.
+}
diff --git a/man/use_whisper_local_stt.Rd b/man/use_whisper_local_stt.Rd
new file mode 100644
index 0000000..0d84ff7
--- /dev/null
+++ b/man/use_whisper_local_stt.Rd
@@ -0,0 +1,37 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/speech_to_text.R
+\name{use_whisper_local_stt}
+\alias{use_whisper_local_stt}
+\title{Use Local Whisper Model for Speech-to-Text}
+\usage{
+use_whisper_local_stt(
+  audio_file,
+  language = "en",
+  initial_prompt = "",
+  model = "turbo",
+  whisper_package = getOption("minutemaker_whisper_package", "openai-whisper")
+)
+}
+\arguments{
+\item{audio_file}{The path to the audio file to transcribe.}
+
+\item{language}{The language of the input audio. Default is "en" for English.
+If NULL, Whisper will attempt to detect the language.}
+
+\item{initial_prompt}{Text to guide the model's style or continue a previous
+segment.}
+
+\item{model}{The Whisper model to use. Default is "turbo". Check
+https://github.com/openai/whisper for other available models.}
+
+\item{whisper_package}{The Python package to use for Whisper (default:
+"openai-whisper").}
+}
+\value{
+A list with the full transcript and the transcription by segments.
+}
+\description{
+This function uses a local Whisper model via Python with reticulate to
+transcribe audio. It can use the official OpenAI Whisper package or any
+compatible Python package.
+}
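Both backends return the common structure described in the \value{}
sections above, so downstream code does not need to know which local model
produced the transcript. A small base-R sketch flattening the segment list
(`res` being the return value of either function; the audio file is
illustrative):

    res <- use_whisper_local_stt("meeting.wav")

    # One row per segment, with start/end times as reported by Whisper
    segments_df <- do.call(rbind, lapply(res$segments, as.data.frame))
    segments_df[, c("start", "end", "text")]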