From 3b18519190ae185022ee7023ac02aae9b77c7148 Mon Sep 17 00:00:00 2001 From: Angelo D'Ambrosio Date: Fri, 8 Mar 2024 15:21:43 +0100 Subject: [PATCH 01/17] build: use rev for dev reproducibility --- .Rprofile | 2 +- renv.lock | 528 ++++++++++++++++++++ renv/.gitignore | 7 + renv/activate.R | 1180 ++++++++++++++++++++++++++++++++++++++++++++ renv/settings.json | 19 + 5 files changed, 1735 insertions(+), 1 deletion(-) create mode 100644 renv.lock create mode 100644 renv/.gitignore create mode 100644 renv/activate.R create mode 100644 renv/settings.json diff --git a/.Rprofile b/.Rprofile index 8b13789..81b960f 100644 --- a/.Rprofile +++ b/.Rprofile @@ -1 +1 @@ - +source("renv/activate.R") diff --git a/renv.lock b/renv.lock new file mode 100644 index 0000000..1bd4e45 --- /dev/null +++ b/renv.lock @@ -0,0 +1,528 @@ +{ + "R": { + "Version": "4.3.2", + "Repositories": [ + { + "Name": "CRAN", + "URL": "https://packagemanager.posit.co/cran/latest" + } + ] + }, + "Packages": { + "R6": { + "Package": "R6", + "Version": "2.5.1", + "Source": "Repository", + "Repository": "CRAN", + "Requirements": [ + "R" + ], + "Hash": "470851b6d5d0ac559e9d01bb352b4021" + }, + "askpass": { + "Package": "askpass", + "Version": "1.2.0", + "Source": "Repository", + "Repository": "CRAN", + "Requirements": [ + "sys" + ], + "Hash": "cad6cf7f1d5f6e906700b9d3e718c796" + }, + "bit": { + "Package": "bit", + "Version": "4.0.5", + "Source": "Repository", + "Repository": "CRAN", + "Requirements": [ + "R" + ], + "Hash": "d242abec29412ce988848d0294b208fd" + }, + "bit64": { + "Package": "bit64", + "Version": "4.0.5", + "Source": "Repository", + "Repository": "CRAN", + "Requirements": [ + "R", + "bit", + "methods", + "stats", + "utils" + ], + "Hash": "9fe98599ca456d6552421db0d6772d8f" + }, + "cli": { + "Package": "cli", + "Version": "3.6.2", + "Source": "Repository", + "Repository": "CRAN", + "Requirements": [ + "R", + "utils" + ], + "Hash": "1216ac65ac55ec0058a6f75d7ca0fd52" + }, + "clipr": { + "Package": "clipr", + "Version": "0.8.0", + "Source": "Repository", + "Repository": "CRAN", + "Requirements": [ + "utils" + ], + "Hash": "3f038e5ac7f41d4ac41ce658c85e3042" + }, + "cpp11": { + "Package": "cpp11", + "Version": "0.4.7", + "Source": "Repository", + "Repository": "CRAN", + "Requirements": [ + "R" + ], + "Hash": "5a295d7d963cc5035284dcdbaf334f4e" + }, + "crayon": { + "Package": "crayon", + "Version": "1.5.2", + "Source": "Repository", + "Repository": "CRAN", + "Requirements": [ + "grDevices", + "methods", + "utils" + ], + "Hash": "e8a1e41acf02548751f45c718d55aa6a" + }, + "curl": { + "Package": "curl", + "Version": "5.2.1", + "Source": "Repository", + "Repository": "CRAN", + "Requirements": [ + "R" + ], + "Hash": "411ca2c03b1ce5f548345d2fc2685f7a" + }, + "dplyr": { + "Package": "dplyr", + "Version": "1.1.4", + "Source": "Repository", + "Repository": "CRAN", + "Requirements": [ + "R", + "R6", + "cli", + "generics", + "glue", + "lifecycle", + "magrittr", + "methods", + "pillar", + "rlang", + "tibble", + "tidyselect", + "utils", + "vctrs" + ], + "Hash": "fedd9d00c2944ff00a0e2696ccf048ec" + }, + "fansi": { + "Package": "fansi", + "Version": "1.0.6", + "Source": "Repository", + "Repository": "CRAN", + "Requirements": [ + "R", + "grDevices", + "utils" + ], + "Hash": "962174cf2aeb5b9eea581522286a911f" + }, + "generics": { + "Package": "generics", + "Version": "0.1.3", + "Source": "Repository", + "Repository": "CRAN", + "Requirements": [ + "R", + "methods" + ], + "Hash": "15e9634c0fcd294799e9b2e929ed1b86" + }, + "glue": { + "Package": "glue", + "Version": "1.7.0", + "Source": "Repository", + "Repository": "RSPM", + "Requirements": [ + "R", + "methods" + ], + "Hash": "e0b3a53876554bd45879e596cdb10a52" + }, + "hms": { + "Package": "hms", + "Version": "1.1.3", + "Source": "Repository", + "Repository": "CRAN", + "Requirements": [ + "lifecycle", + "methods", + "pkgconfig", + "rlang", + "vctrs" + ], + "Hash": "b59377caa7ed00fa41808342002138f9" + }, + "httr": { + "Package": "httr", + "Version": "1.4.7", + "Source": "Repository", + "Repository": "CRAN", + "Requirements": [ + "R", + "R6", + "curl", + "jsonlite", + "mime", + "openssl" + ], + "Hash": "ac107251d9d9fd72f0ca8049988f1d7f" + }, + "jsonlite": { + "Package": "jsonlite", + "Version": "1.8.8", + "Source": "Repository", + "Repository": "CRAN", + "Requirements": [ + "methods" + ], + "Hash": "e1b9c55281c5adc4dd113652d9e26768" + }, + "lifecycle": { + "Package": "lifecycle", + "Version": "1.0.4", + "Source": "Repository", + "Repository": "CRAN", + "Requirements": [ + "R", + "cli", + "glue", + "rlang" + ], + "Hash": "b8552d117e1b808b09a832f589b79035" + }, + "lubridate": { + "Package": "lubridate", + "Version": "1.9.3", + "Source": "Repository", + "Repository": "CRAN", + "Requirements": [ + "R", + "generics", + "methods", + "timechange" + ], + "Hash": "680ad542fbcf801442c83a6ac5a2126c" + }, + "magrittr": { + "Package": "magrittr", + "Version": "2.0.3", + "Source": "Repository", + "Repository": "CRAN", + "Requirements": [ + "R" + ], + "Hash": "7ce2733a9826b3aeb1775d56fd305472" + }, + "mime": { + "Package": "mime", + "Version": "0.12", + "Source": "Repository", + "Repository": "CRAN", + "Requirements": [ + "tools" + ], + "Hash": "18e9c28c1d3ca1560ce30658b22ce104" + }, + "openssl": { + "Package": "openssl", + "Version": "2.1.1", + "Source": "Repository", + "Repository": "CRAN", + "Requirements": [ + "askpass" + ], + "Hash": "2a0dc8c6adfb6f032e4d4af82d258ab5" + }, + "pillar": { + "Package": "pillar", + "Version": "1.9.0", + "Source": "Repository", + "Repository": "CRAN", + "Requirements": [ + "cli", + "fansi", + "glue", + "lifecycle", + "rlang", + "utf8", + "utils", + "vctrs" + ], + "Hash": "15da5a8412f317beeee6175fbc76f4bb" + }, + "pkgconfig": { + "Package": "pkgconfig", + "Version": "2.0.3", + "Source": "Repository", + "Repository": "CRAN", + "Requirements": [ + "utils" + ], + "Hash": "01f28d4278f15c76cddbea05899c5d6f" + }, + "prettyunits": { + "Package": "prettyunits", + "Version": "1.2.0", + "Source": "Repository", + "Repository": "CRAN", + "Requirements": [ + "R" + ], + "Hash": "6b01fc98b1e86c4f705ce9dcfd2f57c7" + }, + "progress": { + "Package": "progress", + "Version": "1.2.3", + "Source": "Repository", + "Repository": "CRAN", + "Requirements": [ + "R", + "R6", + "crayon", + "hms", + "prettyunits" + ], + "Hash": "f4625e061cb2865f111b47ff163a5ca6" + }, + "purrr": { + "Package": "purrr", + "Version": "1.0.2", + "Source": "Repository", + "Repository": "CRAN", + "Requirements": [ + "R", + "cli", + "lifecycle", + "magrittr", + "rlang", + "vctrs" + ], + "Hash": "1cba04a4e9414bdefc9dcaa99649a8dc" + }, + "readr": { + "Package": "readr", + "Version": "2.1.5", + "Source": "Repository", + "Repository": "RSPM", + "Requirements": [ + "R", + "R6", + "cli", + "clipr", + "cpp11", + "crayon", + "hms", + "lifecycle", + "methods", + "rlang", + "tibble", + "tzdb", + "utils", + "vroom" + ], + "Hash": "9de96463d2117f6ac49980577939dfb3" + }, + "renv": { + "Package": "renv", + "Version": "1.0.3", + "Source": "Repository", + "Repository": "CRAN", + "Requirements": [ + "utils" + ], + "Hash": "41b847654f567341725473431dd0d5ab" + }, + "rlang": { + "Package": "rlang", + "Version": "1.1.3", + "Source": "Repository", + "Repository": "RSPM", + "Requirements": [ + "R", + "utils" + ], + "Hash": "42548638fae05fd9a9b5f3f437fbbbe2" + }, + "stringi": { + "Package": "stringi", + "Version": "1.8.3", + "Source": "Repository", + "Repository": "CRAN", + "Requirements": [ + "R", + "stats", + "tools", + "utils" + ], + "Hash": "058aebddea264f4c99401515182e656a" + }, + "stringr": { + "Package": "stringr", + "Version": "1.5.1", + "Source": "Repository", + "Repository": "CRAN", + "Requirements": [ + "R", + "cli", + "glue", + "lifecycle", + "magrittr", + "rlang", + "stringi", + "vctrs" + ], + "Hash": "960e2ae9e09656611e0b8214ad543207" + }, + "sys": { + "Package": "sys", + "Version": "3.4.2", + "Source": "Repository", + "Repository": "CRAN", + "Hash": "3a1be13d68d47a8cd0bfd74739ca1555" + }, + "tibble": { + "Package": "tibble", + "Version": "3.2.1", + "Source": "Repository", + "Repository": "CRAN", + "Requirements": [ + "R", + "fansi", + "lifecycle", + "magrittr", + "methods", + "pillar", + "pkgconfig", + "rlang", + "utils", + "vctrs" + ], + "Hash": "a84e2cc86d07289b3b6f5069df7a004c" + }, + "tidyselect": { + "Package": "tidyselect", + "Version": "1.2.0", + "Source": "Repository", + "Repository": "CRAN", + "Requirements": [ + "R", + "cli", + "glue", + "lifecycle", + "rlang", + "vctrs", + "withr" + ], + "Hash": "79540e5fcd9e0435af547d885f184fd5" + }, + "timechange": { + "Package": "timechange", + "Version": "0.3.0", + "Source": "Repository", + "Repository": "CRAN", + "Requirements": [ + "R", + "cpp11" + ], + "Hash": "c5f3c201b931cd6474d17d8700ccb1c8" + }, + "tzdb": { + "Package": "tzdb", + "Version": "0.4.0", + "Source": "Repository", + "Repository": "CRAN", + "Requirements": [ + "R", + "cpp11" + ], + "Hash": "f561504ec2897f4d46f0c7657e488ae1" + }, + "utf8": { + "Package": "utf8", + "Version": "1.2.4", + "Source": "Repository", + "Repository": "CRAN", + "Requirements": [ + "R" + ], + "Hash": "62b65c52671e6665f803ff02954446e9" + }, + "vctrs": { + "Package": "vctrs", + "Version": "0.6.5", + "Source": "Repository", + "Repository": "CRAN", + "Requirements": [ + "R", + "cli", + "glue", + "lifecycle", + "rlang" + ], + "Hash": "c03fa420630029418f7e6da3667aac4a" + }, + "vroom": { + "Package": "vroom", + "Version": "1.6.5", + "Source": "Repository", + "Repository": "CRAN", + "Requirements": [ + "R", + "bit64", + "cli", + "cpp11", + "crayon", + "glue", + "hms", + "lifecycle", + "methods", + "progress", + "rlang", + "stats", + "tibble", + "tidyselect", + "tzdb", + "vctrs", + "withr" + ], + "Hash": "390f9315bc0025be03012054103d227c" + }, + "withr": { + "Package": "withr", + "Version": "3.0.0", + "Source": "Repository", + "Repository": "RSPM", + "Requirements": [ + "R", + "grDevices", + "graphics" + ], + "Hash": "d31b6c62c10dcf11ec530ca6b0dd5d35" + } + } +} diff --git a/renv/.gitignore b/renv/.gitignore new file mode 100644 index 0000000..0ec0cbb --- /dev/null +++ b/renv/.gitignore @@ -0,0 +1,7 @@ +library/ +local/ +cellar/ +lock/ +python/ +sandbox/ +staging/ diff --git a/renv/activate.R b/renv/activate.R new file mode 100644 index 0000000..cb5401f --- /dev/null +++ b/renv/activate.R @@ -0,0 +1,1180 @@ + +local({ + + # the requested version of renv + version <- "1.0.3" + attr(version, "sha") <- NULL + + # the project directory + project <- getwd() + + # use start-up diagnostics if enabled + diagnostics <- Sys.getenv("RENV_STARTUP_DIAGNOSTICS", unset = "FALSE") + if (diagnostics) { + start <- Sys.time() + profile <- tempfile("renv-startup-", fileext = ".Rprof") + utils::Rprof(profile) + on.exit({ + utils::Rprof(NULL) + elapsed <- signif(difftime(Sys.time(), start, units = "auto"), digits = 2L) + writeLines(sprintf("- renv took %s to run the autoloader.", format(elapsed))) + writeLines(sprintf("- Profile: %s", profile)) + print(utils::summaryRprof(profile)) + }, add = TRUE) + } + + # figure out whether the autoloader is enabled + enabled <- local({ + + # first, check config option + override <- getOption("renv.config.autoloader.enabled") + if (!is.null(override)) + return(override) + + # next, check environment variables + # TODO: prefer using the configuration one in the future + envvars <- c( + "RENV_CONFIG_AUTOLOADER_ENABLED", + "RENV_AUTOLOADER_ENABLED", + "RENV_ACTIVATE_PROJECT" + ) + + for (envvar in envvars) { + envval <- Sys.getenv(envvar, unset = NA) + if (!is.na(envval)) + return(tolower(envval) %in% c("true", "t", "1")) + } + + # enable by default + TRUE + + }) + + if (!enabled) + return(FALSE) + + # avoid recursion + if (identical(getOption("renv.autoloader.running"), TRUE)) { + warning("ignoring recursive attempt to run renv autoloader") + return(invisible(TRUE)) + } + + # signal that we're loading renv during R startup + options(renv.autoloader.running = TRUE) + on.exit(options(renv.autoloader.running = NULL), add = TRUE) + + # signal that we've consented to use renv + options(renv.consent = TRUE) + + # load the 'utils' package eagerly -- this ensures that renv shims, which + # mask 'utils' packages, will come first on the search path + library(utils, lib.loc = .Library) + + # unload renv if it's already been loaded + if ("renv" %in% loadedNamespaces()) + unloadNamespace("renv") + + # load bootstrap tools + `%||%` <- function(x, y) { + if (is.null(x)) y else x + } + + catf <- function(fmt, ..., appendLF = TRUE) { + + quiet <- getOption("renv.bootstrap.quiet", default = FALSE) + if (quiet) + return(invisible()) + + msg <- sprintf(fmt, ...) + cat(msg, file = stdout(), sep = if (appendLF) "\n" else "") + + invisible(msg) + + } + + header <- function(label, + ..., + prefix = "#", + suffix = "-", + n = min(getOption("width"), 78)) + { + label <- sprintf(label, ...) + n <- max(n - nchar(label) - nchar(prefix) - 2L, 8L) + if (n <= 0) + return(paste(prefix, label)) + + tail <- paste(rep.int(suffix, n), collapse = "") + paste0(prefix, " ", label, " ", tail) + + } + + startswith <- function(string, prefix) { + substring(string, 1, nchar(prefix)) == prefix + } + + bootstrap <- function(version, library) { + + friendly <- renv_bootstrap_version_friendly(version) + section <- header(sprintf("Bootstrapping renv %s", friendly)) + catf(section) + + # attempt to download renv + catf("- Downloading renv ... ", appendLF = FALSE) + withCallingHandlers( + tarball <- renv_bootstrap_download(version), + error = function(err) { + catf("FAILED") + stop("failed to download:\n", conditionMessage(err)) + } + ) + catf("OK") + on.exit(unlink(tarball), add = TRUE) + + # now attempt to install + catf("- Installing renv ... ", appendLF = FALSE) + withCallingHandlers( + status <- renv_bootstrap_install(version, tarball, library), + error = function(err) { + catf("FAILED") + stop("failed to install:\n", conditionMessage(err)) + } + ) + catf("OK") + + # add empty line to break up bootstrapping from normal output + catf("") + + return(invisible()) + } + + renv_bootstrap_tests_running <- function() { + getOption("renv.tests.running", default = FALSE) + } + + renv_bootstrap_repos <- function() { + + # get CRAN repository + cran <- getOption("renv.repos.cran", "https://cloud.r-project.org") + + # check for repos override + repos <- Sys.getenv("RENV_CONFIG_REPOS_OVERRIDE", unset = NA) + if (!is.na(repos)) { + + # check for RSPM; if set, use a fallback repository for renv + rspm <- Sys.getenv("RSPM", unset = NA) + if (identical(rspm, repos)) + repos <- c(RSPM = rspm, CRAN = cran) + + return(repos) + + } + + # check for lockfile repositories + repos <- tryCatch(renv_bootstrap_repos_lockfile(), error = identity) + if (!inherits(repos, "error") && length(repos)) + return(repos) + + # retrieve current repos + repos <- getOption("repos") + + # ensure @CRAN@ entries are resolved + repos[repos == "@CRAN@"] <- cran + + # add in renv.bootstrap.repos if set + default <- c(FALLBACK = "https://cloud.r-project.org") + extra <- getOption("renv.bootstrap.repos", default = default) + repos <- c(repos, extra) + + # remove duplicates that might've snuck in + dupes <- duplicated(repos) | duplicated(names(repos)) + repos[!dupes] + + } + + renv_bootstrap_repos_lockfile <- function() { + + lockpath <- Sys.getenv("RENV_PATHS_LOCKFILE", unset = "renv.lock") + if (!file.exists(lockpath)) + return(NULL) + + lockfile <- tryCatch(renv_json_read(lockpath), error = identity) + if (inherits(lockfile, "error")) { + warning(lockfile) + return(NULL) + } + + repos <- lockfile$R$Repositories + if (length(repos) == 0) + return(NULL) + + keys <- vapply(repos, `[[`, "Name", FUN.VALUE = character(1)) + vals <- vapply(repos, `[[`, "URL", FUN.VALUE = character(1)) + names(vals) <- keys + + return(vals) + + } + + renv_bootstrap_download <- function(version) { + + sha <- attr(version, "sha", exact = TRUE) + + methods <- if (!is.null(sha)) { + + # attempting to bootstrap a development version of renv + c( + function() renv_bootstrap_download_tarball(sha), + function() renv_bootstrap_download_github(sha) + ) + + } else { + + # attempting to bootstrap a release version of renv + c( + function() renv_bootstrap_download_tarball(version), + function() renv_bootstrap_download_cran_latest(version), + function() renv_bootstrap_download_cran_archive(version) + ) + + } + + for (method in methods) { + path <- tryCatch(method(), error = identity) + if (is.character(path) && file.exists(path)) + return(path) + } + + stop("All download methods failed") + + } + + renv_bootstrap_download_impl <- function(url, destfile) { + + mode <- "wb" + + # https://bugs.r-project.org/bugzilla/show_bug.cgi?id=17715 + fixup <- + Sys.info()[["sysname"]] == "Windows" && + substring(url, 1L, 5L) == "file:" + + if (fixup) + mode <- "w+b" + + args <- list( + url = url, + destfile = destfile, + mode = mode, + quiet = TRUE + ) + + if ("headers" %in% names(formals(utils::download.file))) + args$headers <- renv_bootstrap_download_custom_headers(url) + + do.call(utils::download.file, args) + + } + + renv_bootstrap_download_custom_headers <- function(url) { + + headers <- getOption("renv.download.headers") + if (is.null(headers)) + return(character()) + + if (!is.function(headers)) + stopf("'renv.download.headers' is not a function") + + headers <- headers(url) + if (length(headers) == 0L) + return(character()) + + if (is.list(headers)) + headers <- unlist(headers, recursive = FALSE, use.names = TRUE) + + ok <- + is.character(headers) && + is.character(names(headers)) && + all(nzchar(names(headers))) + + if (!ok) + stop("invocation of 'renv.download.headers' did not return a named character vector") + + headers + + } + + renv_bootstrap_download_cran_latest <- function(version) { + + spec <- renv_bootstrap_download_cran_latest_find(version) + type <- spec$type + repos <- spec$repos + + baseurl <- utils::contrib.url(repos = repos, type = type) + ext <- if (identical(type, "source")) + ".tar.gz" + else if (Sys.info()[["sysname"]] == "Windows") + ".zip" + else + ".tgz" + name <- sprintf("renv_%s%s", version, ext) + url <- paste(baseurl, name, sep = "/") + + destfile <- file.path(tempdir(), name) + status <- tryCatch( + renv_bootstrap_download_impl(url, destfile), + condition = identity + ) + + if (inherits(status, "condition")) + return(FALSE) + + # report success and return + destfile + + } + + renv_bootstrap_download_cran_latest_find <- function(version) { + + # check whether binaries are supported on this system + binary <- + getOption("renv.bootstrap.binary", default = TRUE) && + !identical(.Platform$pkgType, "source") && + !identical(getOption("pkgType"), "source") && + Sys.info()[["sysname"]] %in% c("Darwin", "Windows") + + types <- c(if (binary) "binary", "source") + + # iterate over types + repositories + for (type in types) { + for (repos in renv_bootstrap_repos()) { + + # retrieve package database + db <- tryCatch( + as.data.frame( + utils::available.packages(type = type, repos = repos), + stringsAsFactors = FALSE + ), + error = identity + ) + + if (inherits(db, "error")) + next + + # check for compatible entry + entry <- db[db$Package %in% "renv" & db$Version %in% version, ] + if (nrow(entry) == 0) + next + + # found it; return spec to caller + spec <- list(entry = entry, type = type, repos = repos) + return(spec) + + } + } + + # if we got here, we failed to find renv + fmt <- "renv %s is not available from your declared package repositories" + stop(sprintf(fmt, version)) + + } + + renv_bootstrap_download_cran_archive <- function(version) { + + name <- sprintf("renv_%s.tar.gz", version) + repos <- renv_bootstrap_repos() + urls <- file.path(repos, "src/contrib/Archive/renv", name) + destfile <- file.path(tempdir(), name) + + for (url in urls) { + + status <- tryCatch( + renv_bootstrap_download_impl(url, destfile), + condition = identity + ) + + if (identical(status, 0L)) + return(destfile) + + } + + return(FALSE) + + } + + renv_bootstrap_download_tarball <- function(version) { + + # if the user has provided the path to a tarball via + # an environment variable, then use it + tarball <- Sys.getenv("RENV_BOOTSTRAP_TARBALL", unset = NA) + if (is.na(tarball)) + return() + + # allow directories + if (dir.exists(tarball)) { + name <- sprintf("renv_%s.tar.gz", version) + tarball <- file.path(tarball, name) + } + + # bail if it doesn't exist + if (!file.exists(tarball)) { + + # let the user know we weren't able to honour their request + fmt <- "- RENV_BOOTSTRAP_TARBALL is set (%s) but does not exist." + msg <- sprintf(fmt, tarball) + warning(msg) + + # bail + return() + + } + + catf("- Using local tarball '%s'.", tarball) + tarball + + } + + renv_bootstrap_download_github <- function(version) { + + enabled <- Sys.getenv("RENV_BOOTSTRAP_FROM_GITHUB", unset = "TRUE") + if (!identical(enabled, "TRUE")) + return(FALSE) + + # prepare download options + pat <- Sys.getenv("GITHUB_PAT") + if (nzchar(Sys.which("curl")) && nzchar(pat)) { + fmt <- "--location --fail --header \"Authorization: token %s\"" + extra <- sprintf(fmt, pat) + saved <- options("download.file.method", "download.file.extra") + options(download.file.method = "curl", download.file.extra = extra) + on.exit(do.call(base::options, saved), add = TRUE) + } else if (nzchar(Sys.which("wget")) && nzchar(pat)) { + fmt <- "--header=\"Authorization: token %s\"" + extra <- sprintf(fmt, pat) + saved <- options("download.file.method", "download.file.extra") + options(download.file.method = "wget", download.file.extra = extra) + on.exit(do.call(base::options, saved), add = TRUE) + } + + url <- file.path("https://api.github.com/repos/rstudio/renv/tarball", version) + name <- sprintf("renv_%s.tar.gz", version) + destfile <- file.path(tempdir(), name) + + status <- tryCatch( + renv_bootstrap_download_impl(url, destfile), + condition = identity + ) + + if (!identical(status, 0L)) + return(FALSE) + + renv_bootstrap_download_augment(destfile) + + return(destfile) + + } + + # Add Sha to DESCRIPTION. This is stop gap until #890, after which we + # can use renv::install() to fully capture metadata. + renv_bootstrap_download_augment <- function(destfile) { + sha <- renv_bootstrap_git_extract_sha1_tar(destfile) + if (is.null(sha)) { + return() + } + + # Untar + tempdir <- tempfile("renv-github-") + on.exit(unlink(tempdir, recursive = TRUE), add = TRUE) + untar(destfile, exdir = tempdir) + pkgdir <- dir(tempdir, full.names = TRUE)[[1]] + + # Modify description + desc_path <- file.path(pkgdir, "DESCRIPTION") + desc_lines <- readLines(desc_path) + remotes_fields <- c( + "RemoteType: github", + "RemoteHost: api.github.com", + "RemoteRepo: renv", + "RemoteUsername: rstudio", + "RemotePkgRef: rstudio/renv", + paste("RemoteRef: ", sha), + paste("RemoteSha: ", sha) + ) + writeLines(c(desc_lines[desc_lines != ""], remotes_fields), con = desc_path) + + # Re-tar + local({ + old <- setwd(tempdir) + on.exit(setwd(old), add = TRUE) + + tar(destfile, compression = "gzip") + }) + invisible() + } + + # Extract the commit hash from a git archive. Git archives include the SHA1 + # hash as the comment field of the tarball pax extended header + # (see https://www.kernel.org/pub/software/scm/git/docs/git-archive.html) + # For GitHub archives this should be the first header after the default one + # (512 byte) header. + renv_bootstrap_git_extract_sha1_tar <- function(bundle) { + + # open the bundle for reading + # We use gzcon for everything because (from ?gzcon) + # > Reading from a connection which does not supply a 'gzip' magic + # > header is equivalent to reading from the original connection + conn <- gzcon(file(bundle, open = "rb", raw = TRUE)) + on.exit(close(conn)) + + # The default pax header is 512 bytes long and the first pax extended header + # with the comment should be 51 bytes long + # `52 comment=` (11 chars) + 40 byte SHA1 hash + len <- 0x200 + 0x33 + res <- rawToChar(readBin(conn, "raw", n = len)[0x201:len]) + + if (grepl("^52 comment=", res)) { + sub("52 comment=", "", res) + } else { + NULL + } + } + + renv_bootstrap_install <- function(version, tarball, library) { + + # attempt to install it into project library + dir.create(library, showWarnings = FALSE, recursive = TRUE) + output <- renv_bootstrap_install_impl(library, tarball) + + # check for successful install + status <- attr(output, "status") + if (is.null(status) || identical(status, 0L)) + return(status) + + # an error occurred; report it + header <- "installation of renv failed" + lines <- paste(rep.int("=", nchar(header)), collapse = "") + text <- paste(c(header, lines, output), collapse = "\n") + stop(text) + + } + + renv_bootstrap_install_impl <- function(library, tarball) { + + # invoke using system2 so we can capture and report output + bin <- R.home("bin") + exe <- if (Sys.info()[["sysname"]] == "Windows") "R.exe" else "R" + R <- file.path(bin, exe) + + args <- c( + "--vanilla", "CMD", "INSTALL", "--no-multiarch", + "-l", shQuote(path.expand(library)), + shQuote(path.expand(tarball)) + ) + + system2(R, args, stdout = TRUE, stderr = TRUE) + + } + + renv_bootstrap_platform_prefix <- function() { + + # construct version prefix + version <- paste(R.version$major, R.version$minor, sep = ".") + prefix <- paste("R", numeric_version(version)[1, 1:2], sep = "-") + + # include SVN revision for development versions of R + # (to avoid sharing platform-specific artefacts with released versions of R) + devel <- + identical(R.version[["status"]], "Under development (unstable)") || + identical(R.version[["nickname"]], "Unsuffered Consequences") + + if (devel) + prefix <- paste(prefix, R.version[["svn rev"]], sep = "-r") + + # build list of path components + components <- c(prefix, R.version$platform) + + # include prefix if provided by user + prefix <- renv_bootstrap_platform_prefix_impl() + if (!is.na(prefix) && nzchar(prefix)) + components <- c(prefix, components) + + # build prefix + paste(components, collapse = "/") + + } + + renv_bootstrap_platform_prefix_impl <- function() { + + # if an explicit prefix has been supplied, use it + prefix <- Sys.getenv("RENV_PATHS_PREFIX", unset = NA) + if (!is.na(prefix)) + return(prefix) + + # if the user has requested an automatic prefix, generate it + auto <- Sys.getenv("RENV_PATHS_PREFIX_AUTO", unset = NA) + if (auto %in% c("TRUE", "True", "true", "1")) + return(renv_bootstrap_platform_prefix_auto()) + + # empty string on failure + "" + + } + + renv_bootstrap_platform_prefix_auto <- function() { + + prefix <- tryCatch(renv_bootstrap_platform_os(), error = identity) + if (inherits(prefix, "error") || prefix %in% "unknown") { + + msg <- paste( + "failed to infer current operating system", + "please file a bug report at https://github.com/rstudio/renv/issues", + sep = "; " + ) + + warning(msg) + + } + + prefix + + } + + renv_bootstrap_platform_os <- function() { + + sysinfo <- Sys.info() + sysname <- sysinfo[["sysname"]] + + # handle Windows + macOS up front + if (sysname == "Windows") + return("windows") + else if (sysname == "Darwin") + return("macos") + + # check for os-release files + for (file in c("/etc/os-release", "/usr/lib/os-release")) + if (file.exists(file)) + return(renv_bootstrap_platform_os_via_os_release(file, sysinfo)) + + # check for redhat-release files + if (file.exists("/etc/redhat-release")) + return(renv_bootstrap_platform_os_via_redhat_release()) + + "unknown" + + } + + renv_bootstrap_platform_os_via_os_release <- function(file, sysinfo) { + + # read /etc/os-release + release <- utils::read.table( + file = file, + sep = "=", + quote = c("\"", "'"), + col.names = c("Key", "Value"), + comment.char = "#", + stringsAsFactors = FALSE + ) + + vars <- as.list(release$Value) + names(vars) <- release$Key + + # get os name + os <- tolower(sysinfo[["sysname"]]) + + # read id + id <- "unknown" + for (field in c("ID", "ID_LIKE")) { + if (field %in% names(vars) && nzchar(vars[[field]])) { + id <- vars[[field]] + break + } + } + + # read version + version <- "unknown" + for (field in c("UBUNTU_CODENAME", "VERSION_CODENAME", "VERSION_ID", "BUILD_ID")) { + if (field %in% names(vars) && nzchar(vars[[field]])) { + version <- vars[[field]] + break + } + } + + # join together + paste(c(os, id, version), collapse = "-") + + } + + renv_bootstrap_platform_os_via_redhat_release <- function() { + + # read /etc/redhat-release + contents <- readLines("/etc/redhat-release", warn = FALSE) + + # infer id + id <- if (grepl("centos", contents, ignore.case = TRUE)) + "centos" + else if (grepl("redhat", contents, ignore.case = TRUE)) + "redhat" + else + "unknown" + + # try to find a version component (very hacky) + version <- "unknown" + + parts <- strsplit(contents, "[[:space:]]")[[1L]] + for (part in parts) { + + nv <- tryCatch(numeric_version(part), error = identity) + if (inherits(nv, "error")) + next + + version <- nv[1, 1] + break + + } + + paste(c("linux", id, version), collapse = "-") + + } + + renv_bootstrap_library_root_name <- function(project) { + + # use project name as-is if requested + asis <- Sys.getenv("RENV_PATHS_LIBRARY_ROOT_ASIS", unset = "FALSE") + if (asis) + return(basename(project)) + + # otherwise, disambiguate based on project's path + id <- substring(renv_bootstrap_hash_text(project), 1L, 8L) + paste(basename(project), id, sep = "-") + + } + + renv_bootstrap_library_root <- function(project) { + + prefix <- renv_bootstrap_profile_prefix() + + path <- Sys.getenv("RENV_PATHS_LIBRARY", unset = NA) + if (!is.na(path)) + return(paste(c(path, prefix), collapse = "/")) + + path <- renv_bootstrap_library_root_impl(project) + if (!is.null(path)) { + name <- renv_bootstrap_library_root_name(project) + return(paste(c(path, prefix, name), collapse = "/")) + } + + renv_bootstrap_paths_renv("library", project = project) + + } + + renv_bootstrap_library_root_impl <- function(project) { + + root <- Sys.getenv("RENV_PATHS_LIBRARY_ROOT", unset = NA) + if (!is.na(root)) + return(root) + + type <- renv_bootstrap_project_type(project) + if (identical(type, "package")) { + userdir <- renv_bootstrap_user_dir() + return(file.path(userdir, "library")) + } + + } + + renv_bootstrap_validate_version <- function(version, description = NULL) { + + # resolve description file + # + # avoid passing lib.loc to `packageDescription()` below, since R will + # use the loaded version of the package by default anyhow. note that + # this function should only be called after 'renv' is loaded + # https://github.com/rstudio/renv/issues/1625 + description <- description %||% packageDescription("renv") + + # check whether requested version 'version' matches loaded version of renv + sha <- attr(version, "sha", exact = TRUE) + valid <- if (!is.null(sha)) + renv_bootstrap_validate_version_dev(sha, description) + else + renv_bootstrap_validate_version_release(version, description) + + if (valid) + return(TRUE) + + # the loaded version of renv doesn't match the requested version; + # give the user instructions on how to proceed + remote <- if (!is.null(description[["RemoteSha"]])) { + paste("rstudio/renv", description[["RemoteSha"]], sep = "@") + } else { + paste("renv", description[["Version"]], sep = "@") + } + + # display both loaded version + sha if available + friendly <- renv_bootstrap_version_friendly( + version = description[["Version"]], + sha = description[["RemoteSha"]] + ) + + fmt <- paste( + "renv %1$s was loaded from project library, but this project is configured to use renv %2$s.", + "- Use `renv::record(\"%3$s\")` to record renv %1$s in the lockfile.", + "- Use `renv::restore(packages = \"renv\")` to install renv %2$s into the project library.", + sep = "\n" + ) + catf(fmt, friendly, renv_bootstrap_version_friendly(version), remote) + + FALSE + + } + + renv_bootstrap_validate_version_dev <- function(version, description) { + expected <- description[["RemoteSha"]] + is.character(expected) && startswith(expected, version) + } + + renv_bootstrap_validate_version_release <- function(version, description) { + expected <- description[["Version"]] + is.character(expected) && identical(expected, version) + } + + renv_bootstrap_hash_text <- function(text) { + + hashfile <- tempfile("renv-hash-") + on.exit(unlink(hashfile), add = TRUE) + + writeLines(text, con = hashfile) + tools::md5sum(hashfile) + + } + + renv_bootstrap_load <- function(project, libpath, version) { + + # try to load renv from the project library + if (!requireNamespace("renv", lib.loc = libpath, quietly = TRUE)) + return(FALSE) + + # warn if the version of renv loaded does not match + renv_bootstrap_validate_version(version) + + # execute renv load hooks, if any + hooks <- getHook("renv::autoload") + for (hook in hooks) + if (is.function(hook)) + tryCatch(hook(), error = warnify) + + # load the project + renv::load(project) + + TRUE + + } + + renv_bootstrap_profile_load <- function(project) { + + # if RENV_PROFILE is already set, just use that + profile <- Sys.getenv("RENV_PROFILE", unset = NA) + if (!is.na(profile) && nzchar(profile)) + return(profile) + + # check for a profile file (nothing to do if it doesn't exist) + path <- renv_bootstrap_paths_renv("profile", profile = FALSE, project = project) + if (!file.exists(path)) + return(NULL) + + # read the profile, and set it if it exists + contents <- readLines(path, warn = FALSE) + if (length(contents) == 0L) + return(NULL) + + # set RENV_PROFILE + profile <- contents[[1L]] + if (!profile %in% c("", "default")) + Sys.setenv(RENV_PROFILE = profile) + + profile + + } + + renv_bootstrap_profile_prefix <- function() { + profile <- renv_bootstrap_profile_get() + if (!is.null(profile)) + return(file.path("profiles", profile, "renv")) + } + + renv_bootstrap_profile_get <- function() { + profile <- Sys.getenv("RENV_PROFILE", unset = "") + renv_bootstrap_profile_normalize(profile) + } + + renv_bootstrap_profile_set <- function(profile) { + profile <- renv_bootstrap_profile_normalize(profile) + if (is.null(profile)) + Sys.unsetenv("RENV_PROFILE") + else + Sys.setenv(RENV_PROFILE = profile) + } + + renv_bootstrap_profile_normalize <- function(profile) { + + if (is.null(profile) || profile %in% c("", "default")) + return(NULL) + + profile + + } + + renv_bootstrap_path_absolute <- function(path) { + + substr(path, 1L, 1L) %in% c("~", "/", "\\") || ( + substr(path, 1L, 1L) %in% c(letters, LETTERS) && + substr(path, 2L, 3L) %in% c(":/", ":\\") + ) + + } + + renv_bootstrap_paths_renv <- function(..., profile = TRUE, project = NULL) { + renv <- Sys.getenv("RENV_PATHS_RENV", unset = "renv") + root <- if (renv_bootstrap_path_absolute(renv)) NULL else project + prefix <- if (profile) renv_bootstrap_profile_prefix() + components <- c(root, renv, prefix, ...) + paste(components, collapse = "/") + } + + renv_bootstrap_project_type <- function(path) { + + descpath <- file.path(path, "DESCRIPTION") + if (!file.exists(descpath)) + return("unknown") + + desc <- tryCatch( + read.dcf(descpath, all = TRUE), + error = identity + ) + + if (inherits(desc, "error")) + return("unknown") + + type <- desc$Type + if (!is.null(type)) + return(tolower(type)) + + package <- desc$Package + if (!is.null(package)) + return("package") + + "unknown" + + } + + renv_bootstrap_user_dir <- function() { + dir <- renv_bootstrap_user_dir_impl() + path.expand(chartr("\\", "/", dir)) + } + + renv_bootstrap_user_dir_impl <- function() { + + # use local override if set + override <- getOption("renv.userdir.override") + if (!is.null(override)) + return(override) + + # use R_user_dir if available + tools <- asNamespace("tools") + if (is.function(tools$R_user_dir)) + return(tools$R_user_dir("renv", "cache")) + + # try using our own backfill for older versions of R + envvars <- c("R_USER_CACHE_DIR", "XDG_CACHE_HOME") + for (envvar in envvars) { + root <- Sys.getenv(envvar, unset = NA) + if (!is.na(root)) + return(file.path(root, "R/renv")) + } + + # use platform-specific default fallbacks + if (Sys.info()[["sysname"]] == "Windows") + file.path(Sys.getenv("LOCALAPPDATA"), "R/cache/R/renv") + else if (Sys.info()[["sysname"]] == "Darwin") + "~/Library/Caches/org.R-project.R/R/renv" + else + "~/.cache/R/renv" + + } + + renv_bootstrap_version_friendly <- function(version, shafmt = NULL, sha = NULL) { + sha <- sha %||% attr(version, "sha", exact = TRUE) + parts <- c(version, sprintf(shafmt %||% " [sha: %s]", substring(sha, 1L, 7L))) + paste(parts, collapse = "") + } + + renv_bootstrap_exec <- function(project, libpath, version) { + if (!renv_bootstrap_load(project, libpath, version)) + renv_bootstrap_run(version, libpath) + } + + renv_bootstrap_run <- function(version, libpath) { + + # perform bootstrap + bootstrap(version, libpath) + + # exit early if we're just testing bootstrap + if (!is.na(Sys.getenv("RENV_BOOTSTRAP_INSTALL_ONLY", unset = NA))) + return(TRUE) + + # try again to load + if (requireNamespace("renv", lib.loc = libpath, quietly = TRUE)) { + return(renv::load(project = getwd())) + } + + # failed to download or load renv; warn the user + msg <- c( + "Failed to find an renv installation: the project will not be loaded.", + "Use `renv::activate()` to re-initialize the project." + ) + + warning(paste(msg, collapse = "\n"), call. = FALSE) + + } + + renv_json_read <- function(file = NULL, text = NULL) { + + jlerr <- NULL + + # if jsonlite is loaded, use that instead + if ("jsonlite" %in% loadedNamespaces()) { + + json <- catch(renv_json_read_jsonlite(file, text)) + if (!inherits(json, "error")) + return(json) + + jlerr <- json + + } + + # otherwise, fall back to the default JSON reader + json <- catch(renv_json_read_default(file, text)) + if (!inherits(json, "error")) + return(json) + + # report an error + if (!is.null(jlerr)) + stop(jlerr) + else + stop(json) + + } + + renv_json_read_jsonlite <- function(file = NULL, text = NULL) { + text <- paste(text %||% read(file), collapse = "\n") + jsonlite::fromJSON(txt = text, simplifyVector = FALSE) + } + + renv_json_read_default <- function(file = NULL, text = NULL) { + + # find strings in the JSON + text <- paste(text %||% read(file), collapse = "\n") + pattern <- '["](?:(?:\\\\.)|(?:[^"\\\\]))*?["]' + locs <- gregexpr(pattern, text, perl = TRUE)[[1]] + + # if any are found, replace them with placeholders + replaced <- text + strings <- character() + replacements <- character() + + if (!identical(c(locs), -1L)) { + + # get the string values + starts <- locs + ends <- locs + attr(locs, "match.length") - 1L + strings <- substring(text, starts, ends) + + # only keep those requiring escaping + strings <- grep("[[\\]{}:]", strings, perl = TRUE, value = TRUE) + + # compute replacements + replacements <- sprintf('"\032%i\032"', seq_along(strings)) + + # replace the strings + mapply(function(string, replacement) { + replaced <<- sub(string, replacement, replaced, fixed = TRUE) + }, strings, replacements) + + } + + # transform the JSON into something the R parser understands + transformed <- replaced + transformed <- gsub("{}", "`names<-`(list(), character())", transformed, fixed = TRUE) + transformed <- gsub("[[{]", "list(", transformed, perl = TRUE) + transformed <- gsub("[]}]", ")", transformed, perl = TRUE) + transformed <- gsub(":", "=", transformed, fixed = TRUE) + text <- paste(transformed, collapse = "\n") + + # parse it + json <- parse(text = text, keep.source = FALSE, srcfile = NULL)[[1L]] + + # construct map between source strings, replaced strings + map <- as.character(parse(text = strings)) + names(map) <- as.character(parse(text = replacements)) + + # convert to list + map <- as.list(map) + + # remap strings in object + remapped <- renv_json_remap(json, map) + + # evaluate + eval(remapped, envir = baseenv()) + + } + + renv_json_remap <- function(json, map) { + + # fix names + if (!is.null(names(json))) { + lhs <- match(names(json), names(map), nomatch = 0L) + rhs <- match(names(map), names(json), nomatch = 0L) + names(json)[rhs] <- map[lhs] + } + + # fix values + if (is.character(json)) + return(map[[json]] %||% json) + + # handle true, false, null + if (is.name(json)) { + text <- as.character(json) + if (text == "true") + return(TRUE) + else if (text == "false") + return(FALSE) + else if (text == "null") + return(NULL) + } + + # recurse + if (is.recursive(json)) { + for (i in seq_along(json)) { + json[i] <- list(renv_json_remap(json[[i]], map)) + } + } + + json + + } + + # load the renv profile, if any + renv_bootstrap_profile_load(project) + + # construct path to library root + root <- renv_bootstrap_library_root(project) + + # construct library prefix for platform + prefix <- renv_bootstrap_platform_prefix() + + # construct full libpath + libpath <- file.path(root, prefix) + + # run bootstrap code + renv_bootstrap_exec(project, libpath, version) + + invisible() + +}) diff --git a/renv/settings.json b/renv/settings.json new file mode 100644 index 0000000..74c1d4b --- /dev/null +++ b/renv/settings.json @@ -0,0 +1,19 @@ +{ + "bioconductor.version": null, + "external.libraries": [], + "ignored.packages": [], + "package.dependency.fields": [ + "Imports", + "Depends", + "LinkingTo" + ], + "ppm.enabled": null, + "ppm.ignored.urls": [], + "r.version": null, + "snapshot.type": "explicit", + "use.cache": true, + "vcs.ignore.cellar": true, + "vcs.ignore.library": true, + "vcs.ignore.local": true, + "vcs.manage.ignores": true +} From b1daf88cb68969c1bfb2c14cbfed2736306150ec Mon Sep 17 00:00:00 2001 From: Angelo D'Ambrosio Date: Fri, 8 Mar 2024 15:37:45 +0100 Subject: [PATCH 02/17] fix: fix dependency installation --- R/utils.R | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/R/utils.R b/R/utils.R index 8108cfd..a487ea3 100644 --- a/R/utils.R +++ b/R/utils.R @@ -17,9 +17,16 @@ check_and_install_dependencies <- function(deps) { # If not, ask the user if they want to install it if (interactive()) { # Only in interactive sessions, otherwise just stop - is_installed <- utils::menu( + do_install <- utils::menu( c("Yes", "No"), title = paste0(dep, " is not installed. Install it now?")) == 1 + + if(do_install) { + try({ + install.packages(dep) + is_installed <- TRUE + }) + } } } From 4a2d159575f5a4e1ebb504820f92387e863af8a9 Mon Sep 17 00:00:00 2001 From: Angelo D'Ambrosio Date: Fri, 8 Mar 2024 15:38:25 +0100 Subject: [PATCH 03/17] fix: add "integer" type among accepted time classes --- R/utils.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/utils.R b/R/utils.R index a487ea3..a8b5d7f 100644 --- a/R/utils.R +++ b/R/utils.R @@ -85,7 +85,7 @@ parse_event_time <- function(time, format = c("R", "T")) { #' time_to_numeric <- function(time, origin = NULL) { - if (!inherits(time, c("character", "POSIXct", "numeric"))) { + if (!inherits(time, c("character", "POSIXct", "numeric", "integer"))) { stop("Invalid time format for parameter 'time'") } From b66b912cd0c7e802c4b9b33e2d9f279e64db172d Mon Sep 17 00:00:00 2001 From: Angelo D'Ambrosio Date: Fri, 8 Mar 2024 15:39:13 +0100 Subject: [PATCH 04/17] ux: catch fatal whisper API error --- R/speech_to_text.R | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/R/speech_to_text.R b/R/speech_to_text.R index e7e978b..21be5ea 100644 --- a/R/speech_to_text.R +++ b/R/speech_to_text.R @@ -385,6 +385,10 @@ use_azure_whisper_stt <- function( # Make the HTTP request response <- httr::POST(url, headers, body = body) + if (response$status_code == 424) { + stop("Fatal error: ", httr::content(response, "text")) + } + # Check response status if (response$status_code != 200) { From bcf15eada4bb384deb6d536eccb7fbde3d28d085 Mon Sep 17 00:00:00 2001 From: Angelo D'Ambrosio Date: Fri, 8 Mar 2024 15:43:17 +0100 Subject: [PATCH 05/17] fix: handle empty transcript subsets and ensure non-zero summary tree length This commit introduces additional checks in the summarization process to handle cases where a transcript subset for a talk is empty. It adds a warning to inform the user if the transcript subset is empty due to incorrect event start times or agenda times. Furthermore, it ensures that the final result tree is not empty, and if it is, the process is stopped with an error message indicating that no talks were summarized. This helps in preventing the generation of empty summaries and guides the user to check their input data. Additionally, the commit includes a minor fix to align the indentation of arguments in the `summarise_transcript` function for improved code readability. --- R/summarization.R | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) diff --git a/R/summarization.R b/R/summarization.R index e33066b..17103f0 100644 --- a/R/summarization.R +++ b/R/summarization.R @@ -271,7 +271,7 @@ summarise_transcript <- function( args <- args[ c("event_description", "recording_details", "audience", - "summary_structure", "extra_output_instructions") + "summary_structure", "extra_output_instructions") ] # Aggregate the summaries @@ -412,6 +412,17 @@ summarise_full_meeting <- function( .data$start >= agenda_element$from, .data$start <= agenda_element$to) + if (nrow(transcript_subset) == 0) { + warning("The transcript subset for the talk ", id, " is empty. ", + if (!is.null(event_start_time)) { + "Did you provide the correct event start time?" + } else { + "Did you provide the correct agenda times?" + }, ". Skipping.", + call. = FALSE, immediate. = TRUE) + next + } + # Extract the details of the talk recording_details <- generate_recording_details(agenda_element) @@ -449,6 +460,10 @@ summarise_full_meeting <- function( dput(result_tree, file = output_file) } + if (length(result_tree) == 0) { + stop("The final result tree has lenght zero. No talks were summarised.") + } + # Return the result tree invisibly invisible(result_tree) } From 2e7abbc8249c85d1e3055ac65ad7a679fbfe5628 Mon Sep 17 00:00:00 2001 From: Angelo D'Ambrosio Date: Fri, 8 Mar 2024 15:49:12 +0100 Subject: [PATCH 06/17] fix: more error management in the llm prompt func --- R/LLM_calls.R | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/R/LLM_calls.R b/R/LLM_calls.R index 282b766..d9d479c 100644 --- a/R/LLM_calls.R +++ b/R/LLM_calls.R @@ -35,7 +35,7 @@ #' process_messages <- function(messages) { - if (missing(messages) || is.null(messages)) { + if (missing(messages) || is.null(messages) || length(messages) == 0) { stop("User messages are required.") } @@ -199,7 +199,7 @@ interrogate_llm <- function( if (httr::status_code(response) == 429) { warning("Rate limit exceeded. Waiting before retrying.", - immediate. = TRUE) + immediate. = TRUE, call. = FALSE) to_wait <- as.numeric(httr::headers(response)$`retry-after`) message("Waiting for ", to_wait, " seconds.\n...") From 6849d8f46c45fe7b48f028d2e51db44509f6e532 Mon Sep 17 00:00:00 2001 From: Angelo D'Ambrosio Date: Fri, 8 Mar 2024 15:49:35 +0100 Subject: [PATCH 07/17] feat: no need to specify "user" if single prompt --- R/LLM_calls.R | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/R/LLM_calls.R b/R/LLM_calls.R index d9d479c..5e946fa 100644 --- a/R/LLM_calls.R +++ b/R/LLM_calls.R @@ -39,6 +39,13 @@ process_messages <- function(messages) { stop("User messages are required.") } + # Assume that a single message is from the user + if (length(messages) == 1 && + is.character(messages) && + is.null(names(messages))) { + messages <- c(user = messages) + } + # Convert vector to list format vector_to_list <- function(msg_vec) { From 0f5b5a67815976188ecf2df0dd8f6634bbc59aa6 Mon Sep 17 00:00:00 2001 From: Angelo D'Ambrosio Date: Fri, 8 Mar 2024 15:50:18 +0100 Subject: [PATCH 08/17] fix: manages validation of integers as agenda times they get converted to numeric first --- R/validation.R | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/R/validation.R b/R/validation.R index bb554ac..0738dad 100644 --- a/R/validation.R +++ b/R/validation.R @@ -41,6 +41,11 @@ validate_agenda_element <- function( # Check if the times are interpretable for (time in c("from", "to")) { + # Convert integer times to numeric to simplify the validation + if (inherits(agenda_element[[time]], "integer")) { + agenda_element[[time]] <- as.numeric(agenda_element[[time]]) + } + if (!inherits(agenda_element[[time]], c("numeric", "POSIXct", "character"))) { stop(stringr::str_glue( From 3c4e877f5d953abd93c54f8e6ae04d94f51843ba Mon Sep 17 00:00:00 2001 From: Angelo D'Ambrosio Date: Fri, 8 Mar 2024 15:50:50 +0100 Subject: [PATCH 09/17] fix: manages empty transcription json files --- R/data_management.R | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/R/data_management.R b/R/data_management.R index 56f562a..bd4d34c 100644 --- a/R/data_management.R +++ b/R/data_management.R @@ -81,6 +81,11 @@ parse_transcript_json <- function( "Please remove it and try transcription again.") } + if (length(transcript_list[[i]]$segments) == 0) { + # skip this file, there was nothing to transcribe + next + } + transcript_data <- transcript_list[[i]]$segments |> bind_rows() |> # Select only the columns to import From 41b823add34d18f0feeaed54d45a00a601a9b8e0 Mon Sep 17 00:00:00 2001 From: Angelo D'Ambrosio Date: Fri, 8 Mar 2024 15:51:17 +0100 Subject: [PATCH 10/17] fix: manage transcription files with no speaker info --- R/data_management.R | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/R/data_management.R b/R/data_management.R index bd4d34c..f566598 100644 --- a/R/data_management.R +++ b/R/data_management.R @@ -285,11 +285,12 @@ extract_text_from_transcript <- function( # Ignore the `import_diarization` parameter if the transcript does not contain # speaker information - if (all(is.na(transcript_data$speaker)) || + if (!"speaker" %in% names(transcript_data) || + all(is.na(transcript_data$speaker)) || n_distinct(transcript_data$speaker, na.rm = T) == 1) { import_diarization <- FALSE - transcript_data$speaker <- "None" + transcript_data$speaker <- "Unknown" } transcript <- transcript_data %>% From e9afb2d44b4f9de1c3ab3dc30905a9295954edc5 Mon Sep 17 00:00:00 2001 From: Angelo D'Ambrosio Date: Fri, 8 Mar 2024 15:52:26 +0100 Subject: [PATCH 11/17] clean unused code --- R/data_management.R | 10 ---------- 1 file changed, 10 deletions(-) diff --git a/R/data_management.R b/R/data_management.R index f566598..7336d28 100644 --- a/R/data_management.R +++ b/R/data_management.R @@ -392,16 +392,6 @@ convert_agenda_times <- function( validate_agenda_element(agenda[[i]], from = TRUE, to = TRUE) } - # if ( - # convert_to == "clocktime" && - # inherits(agenda[[1]]["from"], c("POSIXct", "character"))) { - # - # warning("Agenda already in clock time format.", - # call. = FALSE, immediate. = TRUE) - # - # return(agenda_orig) - # } - # Check if agenda times are all of the same class if (!all(purrr::map_lgl(agenda, ~ is.numeric(.x$from))) && !all(purrr::map_lgl(agenda, ~ { From c458b0d9f9ebe7b20ad1775c44beb69712cfa933 Mon Sep 17 00:00:00 2001 From: Angelo D'Ambrosio Date: Fri, 8 Mar 2024 16:00:36 +0100 Subject: [PATCH 12/17] feat: implement agenda inference from transcripts Add a new function `infer_agenda_from_transcript` and related prompt generation functions. This function automates the generation of an event agenda by analyzing a given transcript. It tries to identify and extract key segments from the transcript, which are then used to construct an agenda. The process can be informed by contextual information such as event description, vocabulary, diarization instructions, and an expected agenda to guide the LLM in generating a more accurate and context-aware agenda. Additionally, the function handles JSON parsing errors and adjusts the processing window size dynamically to ensure valid JSON output from the LLM. The agenda inference process is designed to be robust, with the ability to resume from temporary data if the process is interrupted. This enhancement streamlines the workflow for summarizing meetings and conferences by providing a structured overview of the event's proceedings. --- R/data_management.R | 57 +++++-- R/prompts.R | 227 ++++++++++++++++++++++++-- R/summarization.R | 387 ++++++++++++++++++++++++++++++++++++++++++++ README.Rmd | 22 +++ README.md | 24 +++ 5 files changed, 694 insertions(+), 23 deletions(-) diff --git a/R/data_management.R b/R/data_management.R index 7336d28..cfc22d4 100644 --- a/R/data_management.R +++ b/R/data_management.R @@ -984,8 +984,25 @@ add_chat_transcript <- function( #' generate the chat file. See `add_chat_transcript` for more details. #' @param agenda The agenda of the meeting, that is, a list of agenda elements #' each with a session name, a title, speaker and moderator lists, type of -#' talk and start and end times. Alternatively, the path to an R file -#' containing such a list. See `summarise_full_meeting` for more details. +#' talk, talk description and start and end times. Alternatively, the path to +#' an R file containing such a list. See `summarise_full_meeting` for more +#' details. If NULL, the user will be asked if the system should try to +#' generate the agenda automatically, using the `infer_agenda_from_transcript` +#' function. +#' @param expected_agenda A character string. Only used if the `agenda` argument +#' is `NULL` and the user requests the automatic agenda generation. this +#' string will be used to drive the LLM while generating the agenda. See +#' `infer_agenda_from_transcript` for more details. +#' @param agenda_generation_window_size The size of the window in seconds to +#' analyze at once when generating the agenda. See +#' `infer_agenda_from_transcript` for more details. +#' @param agenda_generation_output_file A string with the path to the output +#' file where the automatically generated agenda will be written. Should be a +#' .R file. See `infer_agenda_from_transcript` for more details. +#' @param extra_agenda_generation_args Additional arguments passed to the +#' `infer_agenda_from_transcript` function. See `infer_agenda_from_transcript` +#' for more details. Note that the `diarization_instructions` argument for this +#' function will be taken from the `extra_agenda_generation_args` argument. #' @param summarization_method A string indicating the summarization method to #' use. See `summarise_full_meeting` for more details. #' @param event_description A string containing a description of the meeting. @@ -1000,9 +1017,9 @@ add_chat_transcript <- function( #' should take into account the diarization of the transcript. See #' `summarise_transcript` for more details. #' @param summary_structure,extra_diarization_instructions,extra_output_instructions -#' Specific instructions necessary to build the summarisation prompt. See -#' `summarise_transcript` for more details and run `get_prompts()` to see the -#' defaults. See `summarise_transcript` for more details. +#' Specific instructions necessary to build the summarisation prompt. See +#' `summarise_transcript` for more details and run `get_prompts()` to see the +#' defaults. See `summarise_transcript` for more details. #' @param llm_provider A string indicating the LLM provider to use for the #' summarization. See `summarise_transcript` for more details. #' @param extra_summarise_args Additional arguments passed to the @@ -1063,8 +1080,12 @@ speech_to_summary_workflow <- function( , full.names = T)[1], chat_format = "webex", - # Arguments for `summarise_full_meeting` + # Arguments for `summarise_full_meeting` and `infer_agenda_from_transcript` agenda = file.path(target_dir, "agenda.R"), + expected_agenda = NULL, + agenda_generation_window_size = 7200, + agenda_generation_output_file = file.path(target_dir, "agenda.R"), + extra_agenda_generation_args = NULL, event_description = NULL, event_audience = "An audience with understanding of the topic", @@ -1245,7 +1266,7 @@ speech_to_summary_workflow <- function( } else { choice <- utils::menu( choices = c( - "Generate a default agenda (i.e., process the transcript as one talk)", + "Generate the agenda automatically (You will need to review it before proceeding)", "Exit (write your own agenda)" ), title = "How do you want to proceed?" @@ -1258,12 +1279,22 @@ speech_to_summary_workflow <- function( } # Generate a default agenda with 1 talk/meeting if none is provided - agenda <- list( - list( - from = min(transcript_data$start), - to = max(transcript_data$end) - ) - ) + agenda_infer_args <- c(list( + transcript = transcript_data, + event_description = event_description, + vocabulary = vocabulary, + diarization_instructions = extra_diarization_instructions, + start_time = event_start_time, + expected_agenda = expected_agenda, + window_size = agenda_generation_window_size, + output_file = file.path(target_dir, "agenda.R"), + provider = llm_provider + ), extra_agenda_generation_args) + + agenda <- do.call(infer_agenda_from_transcript, agenda_infer_args) + + message("Agenda generated. Please review it before proceeding.") + return(invisible(transcript_data)) } message("\n### Summarizing transcript...\n") diff --git a/R/prompts.R b/R/prompts.R index 652b648..f17ead3 100644 --- a/R/prompts.R +++ b/R/prompts.R @@ -74,6 +74,29 @@ set_prompts <- function( base_task = "Your task is to provide a summary of the transcript segments that will be given to you.", aggregate_task_rolling = "Your task is to aggregate the summaries generated from the segments of a meeting/talk into a single, comprehensive text, without loss of information.", + agenda_inference_task = collapse( + "Your task is to extract individual talks from this transcript, creating an agenda. You can identify them from a change of speaker, and or, a change of topic. Try to detect broad changes of topics so to avoid splitting the transcript into an excessively large number of small talks; a talk usually last at least 10-15 minutes to one hour, so join into one talk very short change of topics, even if the speaker change. Aggregate both the talk itself and possible Q&A sessions in the same talk.", + "You wil be FIRST producing a step by step reasoning of what could be a good subdivision of the transcript into different talks, considering different competing subdivisions, and THEN suggest the agenda. Take speakers, topics, and timings into consideration in your reasoning.", + "Your output will be a JSON object with two components: your reasoning and the actual agenda. The agenda must be an array of \"talk\" objects, each with the talk title, a short description (1 sentence), a label with the type of talk (e.g. welcome talk, conference outline, conference talk, meeting discussion, Q&A session, etc...), an array with one of more speakers, another array with moderators (if detectable) and the starting and end time in seconds. Add also the \"session\" object if it make sense as grouping.", + "Here's an example of the output structure:", + "### + { + reasoning = \"Your reasoning goes here\", + agenda = [ + { + title = \"The talk title\", + type = \"Conference talk\", + description = \"A description of this talk\", + speakers = [\"speaker 1\", \"speaker 2\"], + moderators = [\"moderator 1\"] # If detectable, otherwise ignore this field + from = 1231, to = 2023 + }, + {...}, /* another talk element */ + ... + ] + } + ###", + "Important: process the whole transcript, do not be lazy: your agenda should cover the entirety of the transcript."), event_description_template = collapse( "The following is a description of the event in which the talk/meeting took place, which will provide you with context.", @@ -93,18 +116,15 @@ set_prompts <- function( "", "{transcript}", "" ), - # transcript_template_one_shot = collapse( - # "Here is the transcript segment you need to summarise:", - # "", "{transcript}", "" - # ), - # - # transcript_template_rolling = collapse( - # "Here is the transcript of the segment you need to summarise:", - # "", "[...]\n{transcript}\n[...]", "" - # ), - aggregate_template_rolling = "Here are the segment summaries to aggregate:", + agenda_inference_template = collapse( + "This is the transcript of an event/meeting:\n", + "{transcript}", + "\n", + "The transcript is formatted as a csv with the start and end time of each segment and the segment text." + ), + vocabulary_template = "Mind that the transcript is not perfect and the following and other terms and names may have been wrongly transcribed. Here's a list of technical terms, acronyms and names you may find in the trascript and that may have been wrongly transcribed:\n{vocabulary}.\nRecognize and correct misspelled versions of these and other related terms and names.", diarization_template = collapse( @@ -447,3 +467,190 @@ generate_rolling_aggregation_prompt <- function( stringr::str_glue_data(prompt, .x = args, .null = NULL) } + + +#' Generate the agenda inference prompt +#' +#' This function is used by `infer_agenda_from_transcript()` to generate a +#' prompt for inferring the agenda from a transcript. +#' +#' @param transcript_segment A segment of the transcript to be used for +#' inferring the agenda. Can be a character vector representing the data in CSV +#' format or a data frame. +#' @param args A list of arguments to be passed to the prompt template. They can +#' include: event_description, vocabulary and expected_agenda. +#' +#' @return A prompt used by `infer_agenda_from_transcript()`. +#' +generate_agenda_inference_prompt <- function( + transcript_segment, + args +) { + + if (is.data.frame(transcript_segment)) { + transcript_segment <- readr::format_csv(transcript_segment) + } + + if (!is.null(args$vocabulary)) { + # Format the vocabulary argument if a vector is provided + args$vocabulary <- paste0( + "- ", + args$vocabulary, + collapse = "\n" + ) + } + + # Aggregate instructions if length > 1 vectors and convert into the + # extra_diarization_instructions argument + if (length(args$diarization_instructions) > 0) { + args$extra_diarization_instructions <- paste( + args$diarization_instructions, collapse = "\n" + ) + } + + long_arguments <- purrr::map_lgl(args, ~ length(.x) > 1) + + if (any(long_arguments)) { + stop("All arguments in args should have length 1:\n", + stringr::str_flatten_comma(names(args)[long_arguments])) + } + + prompt <- paste( + "Your task is to extract individual talks from a transcript, creating an agenda.", + + if (!is.null(args$event_description)) { + # Uses the {event_description} argument + get_prompts("event_description_template") + }, + + if (!is.null(args$vocabulary)) { + # Uses the {vocabulary} argument + get_prompts("vocabulary_template") + }, + + # Uses the {extra_diarization_instructions} argument + if (!is.null(args$diarization_instructions)) { + get_prompts("diarization_template") + }, + + "This is the transcript of the event/meeting from which you need to infer the agenda items:\n\n{transcript_segment}\n\n\nThe transcript is formatted as a csv with the start and end time of each segment, the segment text and possibly, the speakers.", + + sep = "\n\n" + ) |> + stringr::str_glue_data(.x = args, .null = NULL) |> + paste( + 'You can identify the talks from a change of speakers, and or, a change of topic. Try to detect broad changes of topics so to avoid splitting the transcript into an excessively large number of small talks; a talk usually last at least 10-15 minutes to one hour, so join into one talk very short change of topics, even if the speaker change. Aggregate talks and the related Q&A sessions in the same talk. + +You wil be FIRST producing an INFORMATION DENSE, step by step reasoning of what could be a good subdivision of the transcript into different talks, considering different competing subdivisions, listing each identified talk start time and topics. THEN you will extract the starting times of each talk. + +Take speakers, topics, and timings into consideration in your reasoning. The reasoning doesn\'t have to be human readable. Favor a high information over length ratio.', + + if (!is.null(args$expected_agenda)) { + stringr::str_glue_data( + .x = args, + .null = NULL, + "The agenda is expected to have the following talks: ### +{expected_agenda} +### +Try to match the agenda you generated to this structure.") + }, + + 'Your output will be a JSON object with two components: your reasoning and the start times of each identified talks. Here\'s an example of the output structure: +### + { + reasoning = "Your reasoning goes here", + start_times = [1, 232, 1242, 2343, 5534, 7023, ...] + } + ### + +Important: process the whole transcript, do not be lazy: your agenda WILL cover the entirety of the transcript, FROM START TO END WITHOUT TIME HOLES.', + + sep ="\n" + ) +} + +#' Generate the prompt to extract an agenda element details from a transcript +#' +#' This function is used by `infer_agenda_from_transcript()` to generate a +#' prompt for extracting the details of an agenda element from a transcript. +#' +#' @param transcript_segment A segment of the transcript to be used for +#' extracting the details of an agenda element. Can be a character vector +#' representing the data in CSV format or a data frame. +#' @param args A list of arguments to be passed to the prompt template. They can +#' include: event_description and vocabulary. +#' +#' @return A prompt used by `infer_agenda_from_transcript()`. +#' +generate_agenda_element_prompt <- function( + transcript_segment, + args +) { + + if (is.data.frame(transcript_segment)) { + transcript_segment <- readr::format_csv(transcript_segment) + } + + if (!is.null(args$vocabulary)) { + # Format the vocabulary argument if a vector is provided + args$vocabulary <- paste0( + "- ", + args$vocabulary, + collapse = "\n" + ) + } + + # Aggregate instructions if length > 1 vectors and convert into the + # extra_diarization_instructions argument + if (length(args$diarization_instructions) > 0) { + args$extra_diarization_instructions <- paste( + args$diarization_instructions, collapse = "\n" + ) + } + + long_arguments <- purrr::map_lgl(args, ~ length(.x) > 1) + + if (any(long_arguments)) { + stop("All arguments in args should have length 1:\n", + stringr::str_flatten_comma(names(args)[long_arguments])) + } + + prompt <- paste( + "This is a segment of the transcript of an event/meeting: + +\n{transcript_segment}\n + +The transcript is formatted as a csv with the start and end time of each segment, the segment text and possibly, the speakers.", + + if (!is.null(args$event_description)) { + # Uses the {event_description} argument + get_prompts("event_description_template") + }, + + if (!is.null(args$vocabulary)) { + # Uses the {vocabulary} argument + get_prompts("vocabulary_template") + }, + + # Uses the {extra_diarization_instructions} argument + if (!is.null(args$diarization_instructions)) { + get_prompts("diarization_template") + }, + + sep = "\n\n" + ) |> + stringr::str_glue_data(.x = args, .null = NULL) |> + paste( + 'Your task is to extract a title and a short description (1-2 sentences max) for this talk, considering that it\'s part of a larger event. Assign also a label, e.g., welcome talk, conference outline, conference talk, meeting discussion, Q&A session, etc... (the start/end times can be helpful for this). Extract also the speakers and the moderators (if any). Format your output as a JSON object with the following structure: ### + { + title = "The talk title", + type = "A label to define the talk", + description = "A description of this talk", + speakers = ["speaker 1", "speaker 2"], + moderators = ["moderator 1"] # If detectable, otherwise ignore this field + } + ###', + + sep = "\n\n" + ) +} diff --git a/R/summarization.R b/R/summarization.R index 17103f0..7d2f5dd 100644 --- a/R/summarization.R +++ b/R/summarization.R @@ -467,3 +467,390 @@ summarise_full_meeting <- function( # Return the result tree invisibly invisible(result_tree) } + + +#' Infer the agenda from a transcript +#' +#' This function takes a transcript and various optional parameters, and uses an +#' LLM to generate an agenda. +#' +#' @param transcript The transcript to be summarised. Can be a file path or a +#' data frame. +#' @param event_description A description of the event. Provide context about +#' the event. +#' @param vocabulary A character vector of specific vocabulary words, names, +#' definitions, to help the LLM recognise misspellings and abbreviations. +#' @param start_time The start time of the event in the HH:MM(:SS)( AM/PM) +#' format. Necessary to convert the agenda times from seconds to an easier to +#' read format. +#' @param expected_agenda The expected agenda of the event. A text description +#' of the expected agenda. If provided, the LLM will be asked to generate an +#' agenda that matches this description. +#' @param window_size The time window that will be taken into account when +#' inferring the agenda. Default is 2 hours. A larger window will increase the +#' accuracy of the agenda since it will provide context and will prevent to +#' have talks crossing the window boundaries; also decrease the chance of +#' having the LLM being over sensitive to small changes in topics, generating +#' too many small talks. However, a larger window will also require a larger +#' LLM context. +#' @param output_file An optional file to save the results to. Default is NULL, +#' i.e., the results are not saved to a file. +#' @param ... Additional arguments passed to the `interrogate_llm` function. +#' Keep in consideration that this function needs LLMs that manages long +#' context and that produce valid JSON outputs. The `force_json` argument is +#' used with OpenAI based LLM but it's not accepted by other LLMs; therefore +#' the user may need to edit the system prompts to ensure that the output is a +#' valid JSON. +#' +#' @return An agenda in the usual list format. +#' +#' @export +#' +infer_agenda_from_transcript <- function( + transcript, + event_description = NULL, + vocabulary = NULL, + diarization_instructions = NULL, + start_time = NULL, + expected_agenda = NULL, + window_size = 3600, + output_file = NULL, + ... +) { + + # Set the default prompts if not already set + set_prompts() + + # import the transcript if it's a file path + if (is.character(transcript)) { + # Is the transcript a CSV file? + if (stringr::str_detect(transcript, "\\.csv$")) { + transcript_data <- readr::read_csv(transcript, show_col_types = FALSE) + } + # Is the transcript a subtitle file? + else { + transcript_data <- import_transcript_from_file(transcript) + } + } else if (is.data.frame(transcript)) { + transcript_data <- transcript + } else { + stop("The transcript must be a file path or a data frame.") + } + + transcript_data <- transcript_data |> + select(start, end, text, any_of("speaker")) |> + mutate( + across(c(start, end), ~ round(.x)), + ) |> + filter(!is_silent(text)) + + breakpoints <- seq( + transcript_data$start[1], max(transcript_data$start), by = window_size) + + pause_duration <- 1200 + + pauses <- transcript_data |> + filter( + start - lag(end, default = 0) > pause_duration + ) |> pull(start) + + breakpoints <- c(breakpoints, pauses) |> sort() + + for (i in which(breakpoints %in% pauses)) { + if (breakpoints[i] - breakpoints[i - 1] < pause_duration) { + breakpoints <- breakpoints[-(i - 1)] + } + + if (breakpoints[i + 1] - breakpoints[i] < pause_duration) { + breakpoints <- breakpoints[-(i + 1)] + } + } + + last_segment <- max(transcript_data$start) - tail(breakpoints, n=1) + + # Adjust if the last segment is less than window_size / 2 seconds + if (last_segment < (window_size / 2)) { + breakpoints <- utils::head(breakpoints, -1) + } + + stop <- FALSE + cur_bp <- 1 + json_error <- FALSE + + # Check if there was an already started session that got interrupted + arg_hash <- rlang::hash( + list( + transcript_data = transcript_data, + event_description = event_description, + vocabulary = vocabulary, + diarization_instructions = diarization_instructions, + start_time = start_time, + expected_agenda = expected_agenda, + window_size = window_size) + ) + + # Reset the temporary agenda if the arguments have changed + if (is.null(getOption("minutemaker_temp_agenda"))) { + options( + "minutemaker_temp_agenda" = list(), + "minutemaker_temp_agenda_last_bp" = NULL + ) + } else if (getOption("minutemaker_temp_agenda_hash", "") != arg_hash) { + options( + "minutemaker_temp_agenda" = list(), + "minutemaker_temp_agenda_last_bp" = NULL + ) + } else { + message("A temporary agenda was found. Resuming the inference.") + } + + options("minutemaker_temp_agenda_hash" = arg_hash) + + update_agenda <- function(agenda_elements) { + cur_agenda <- c( + getOption("minutemaker_temp_agenda", list()), + agenda_elements |> sort() + ) + + options("minutemaker_temp_agenda" = cur_agenda) + } + + message("- Inferring the agenda from the transcript") + + while (isFALSE(stop)) { + + bp_left <- breakpoints[cur_bp] + bp_right <- breakpoints[cur_bp + 1] + + # Stop if reached the end + if (is.na(bp_right)) { + bp_right <- max(transcript_data$start) + 1 + } + + # Check if the current segment was already processed + if (cur_bp <= getOption("minutemaker_temp_agenda_last_bp", 0)) { + if (cur_bp == length(breakpoints)) stop <- TRUE + + cur_bp <- cur_bp + 1 + + next + } + + transcript_segment <- transcript_data |> + dplyr::filter( + .data$start >= bp_left, + .data$start < bp_right + ) + + # Skip empty segments + if (nrow(transcript_segment) == 0) { + if (cur_bp == length(breakpoints)) stop <- TRUE + + cur_bp <- cur_bp + 1 + + next + } + + transcript_segment <- transcript_segment |> readr::format_csv() + + prompt <- generate_agenda_inference_prompt( + transcript_segment, + args = mget( + c("event_description", "vocabulary", + "diarization_instructions", "expected_agenda"), + ifnotfound = list(NULL)) + ) + + # Build the prompt set + prompt_set <- c( + system = get_prompts("persona"), + user = prompt + ) + + # If this is a retry for failed json parsing, add the previous result to the + # prompt set and add instructions to fix the output + if (json_error) { + prompt_set <- c( + prompt_set, + assistant = result_json, + user = "Your output was not a valid JSON. + Please correct it to provide a valid output.") + } + + # Attempt to interrogate the LLM + result_json <- try(interrogate_llm( + prompt_set, + ..., + force_json = TRUE + ), silent = TRUE) + + # If the interrogation fails due to too long output, retry with a smaller + # window + if (inherits(result_json, "try-error") && + grepl("Answer exhausted the context window", result_json)) { + + warning( + "Answer exhausted the context window. retrying...", + immediate. = T, call. = F) + + # Add a new breakpoint in the middle of the current segment + new_bp <- (bp_left + bp_right) / 2 + breakpoints <- sort(c(breakpoints, new_bp)) + + # Prevent stopping, in case the error happened on the last segment + stop <- FALSE + + next + } else if (inherits(result_json, "try-error")) { + + stop(result_json) + + } + + cat(result_json) + + # Attempt to parse the result json + parsed_result <- try( + jsonlite::fromJSON(result_json, simplifyDataFrame = F)$start_times, + silent = TRUE) + + # If the parsing fails... + if (inherits(parsed_result, "try-error")) { + + # If this is the first parsing error, retry with instructions to fix the + # output + if (!json_error) { + warning( + "Output not a valid JSON. retrying...", + immediate. = T, call. = F) + + json_error <- TRUE + } + # If this is the second parsing error, shorten the window + else { + + warning( + "Output not a valid JSON. Shortening the window...", + immediate. = T, call. = F) + + json_error <- FALSE + + # Add a new breakpoint in the middle of the current segment + new_bp <- (bp_left + bp_right) / 2 + breakpoints <- sort(c(breakpoints, new_bp)) + + } + + # Prevent stopping, in case the error happened on the last segment + stop <- FALSE + + next + } + + # If the parsing is successful, update the agenda + update_agenda(parsed_result) + + json_error <- FALSE + + options("minutemaker_temp_agenda_last_bp" = cur_bp) + + if (cur_bp == length(breakpoints)) stop <- TRUE + + cur_bp <- cur_bp + 1 + + } + + agenda_times <- getOption("minutemaker_temp_agenda", list()) + + if (length(agenda_times) == 0) { + warning("No agenda was inferred from the transcript.", + immediate. = T, call. = F) + return(NULL) + } + + # Remove segments that are too short or that precede the previous one. + agenda_times <- agenda_times |> purrr::imap(\(x, i) { + if (i == 1) return(agenda_times[[i]]) + + this_time <- agenda_times[[i]] + prev_time <- agenda_times[[i - 1]] + + # segments should last at least 5 minutes and not be negative + if (this_time - prev_time < 150) return(NULL) + + return(this_time) + }) |> unlist() + + message("- Extracting agenda items details") + + # Extract the talks' details from the transcript + agenda <- purrr::imap(agenda_times, \(start, i) { + # if (i == 1) start <- 1 + + # Stop at the end of the transcript if there is no next agenda element + end <- min( + c(agenda_times[i + 1], max(transcript_data$end)), + na.rm = TRUE) + + # Stop at the pause if there is one in the talk segment + pauses <- pauses[between(pauses, start, end)] + end <- min(c(end, pauses), na.rm = TRUE) + + element <- list( + # Sometimes, int are produced, which creates problems when converting to + # clocktime + from = as.numeric(start), + to = as.numeric(end) + ) + + transcript_segment <- transcript_data |> + filter( + .data$start >= element$from, + .data$end <= element$to, + ) |> readr::format_csv() + + prompt <- generate_agenda_element_prompt( + transcript_segment, + # I cannot use mget here because the prompt function is not in the + # environment of the calling function. Probably there's a way to use mget + # also here + args = list( + event_description = event_description, + vocabulary = vocabulary, + diarization_instructions = diarization_instructions) + ) + + # Build the prompt set + prompt_set <- c( + system = get_prompts("persona"), + user = prompt + ) + + result_json <- interrogate_llm( + prompt_set, + ..., force_json = TRUE + ) + + jsonlite::fromJSON(result_json, simplifyDataFrame = F) |> + c(element) + }) + + if (!is.null(start_time)) { + agenda <- agenda |> + convert_agenda_times( + convert_to = "clocktime", + event_start_time = start_time) + } + + if (!is.null(output_file)) { + dput(agenda, file = output_file) + } + + options( + minutemaker_temp_agenda_last_bp = NULL, + minutemaker_temp_agenda = NULL, + minutemaker_temp_agenda_hash = NULL + ) + + agenda +} diff --git a/README.Rmd b/README.Rmd index e131206..6795020 100644 --- a/README.Rmd +++ b/README.Rmd @@ -51,6 +51,9 @@ provide a longer context window and better summarization quality. Otherwise, the "rolling window" summarization method can be used to produce summaries of consistent quality on long transcripts also with smaller models. +In general, we suggest to use an LLM with a >32K long context window, to avoid +loss of information. + Here is an example workflow. ### Setting up the package @@ -304,6 +307,16 @@ timings are not mandatory if the meeting consists of only one talk. The agenda object itself is not strictly necessary, but can be helpful for long meetings with multiple talks. +An alternative approach is to generate the agenda automatically via the +`infer_agenda_from_transcript()` function. This function uses the transcript to +infer the different sessions of the talk, their start and end times and also +generate a description and a title. The function can use contextual information +to improve the quality of the generated agenda, such as the event description, +the audience, a vocabulary, and the expected agenda. + +It's important to review and correct the inferred agenda, since the function +might not be able to infer the correct structure of the meeting. + ### Summarizing a single meeting transcript The final step is summarizing the transcript. This can be done using the @@ -537,7 +550,16 @@ speech_to_summary_workflow( # Arguments for `summarise_full_meeting` # Assumes an existing agenda.R file in the working directory + # If an agenda doesn't exist, will ask if the LLM should infer it (see + # following arguments) agenda = "agenda.R", + + # Arguments for `infer_agenda_from_transcript` + expected_agenda = NULL, + agenda_generation_window_size = 7200, + agenda_generation_output_file = file.path(target_dir, "agenda.R"), + extra_agenda_generation_args = NULL, + summarization_output_file = "event_summary.R", event_description = event_description, diff --git a/README.md b/README.md index 857194d..4e01e8c 100644 --- a/README.md +++ b/README.md @@ -44,6 +44,9 @@ quality. Otherwise, the “rolling window” summarization method can be used to produce summaries of consistent quality on long transcripts also with smaller models. +In general, we suggest to use an LLM with a \>32K long context window, +to avoid loss of information. + Here is an example workflow. ### Setting up the package @@ -294,6 +297,18 @@ transcript. The timings are not mandatory if the meeting consists of only one talk. The agenda object itself is not strictly necessary, but can be helpful for long meetings with multiple talks. +An alternative approach is to generate the agenda automatically via the +`infer_agenda_from_transcript()` function. This function uses the +transcript to infer the different sessions of the talk, their start and +end times and also generate a description and a title. The function can +use contextual information to improve the quality of the generated +agenda, such as the event description, the audience, a vocabulary, and +the expected agenda. + +It’s important to review and correct the inferred agenda, since the +function might not be able to infer the correct structure of the +meeting. + ### Summarizing a single meeting transcript The final step is summarizing the transcript. This can be done using the @@ -525,7 +540,16 @@ speech_to_summary_workflow( # Arguments for `summarise_full_meeting` # Assumes an existing agenda.R file in the working directory + # If an agenda doesn't exist, will ask if the LLM should infer it (see + # following arguments) agenda = "agenda.R", + + # Arguments for `infer_agenda_from_transcript` + expected_agenda = NULL, + agenda_generation_window_size = 7200, + agenda_generation_output_file = file.path(target_dir, "agenda.R"), + extra_agenda_generation_args = NULL, + summarization_output_file = "event_summary.R", event_description = event_description, From 9361f3768f4b38d9e7cede7bdd844bb31ef201ab Mon Sep 17 00:00:00 2001 From: Angelo D'Ambrosio Date: Fri, 8 Mar 2024 16:01:04 +0100 Subject: [PATCH 13/17] docs: documentation update --- NAMESPACE | 1 + man/generate_agenda_element_prompt.Rd | 25 ++++++++++ man/generate_agenda_inference_prompt.Rd | 23 ++++++++++ man/infer_agenda_from_transcript.Rd | 61 +++++++++++++++++++++++++ man/minutemaker.Rd | 2 +- man/speech_to_summary_workflow.Rd | 29 +++++++++++- 6 files changed, 138 insertions(+), 3 deletions(-) create mode 100644 man/generate_agenda_element_prompt.Rd create mode 100644 man/generate_agenda_inference_prompt.Rd create mode 100644 man/infer_agenda_from_transcript.Rd diff --git a/NAMESPACE b/NAMESPACE index 448d95f..1ee47d5 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -7,6 +7,7 @@ export(format_summary_tree) export(generate_recording_details) export(get_prompts) export(import_transcript_from_file) +export(infer_agenda_from_transcript) export(interrogate_llm) export(merge_transcripts) export(parse_transcript_json) diff --git a/man/generate_agenda_element_prompt.Rd b/man/generate_agenda_element_prompt.Rd new file mode 100644 index 0000000..abe0dfa --- /dev/null +++ b/man/generate_agenda_element_prompt.Rd @@ -0,0 +1,25 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/prompts.R +\name{generate_agenda_element_prompt} +\alias{generate_agenda_element_prompt} +\title{Generate the prompt to extract an agenda element details from a transcript} +\usage{ +generate_agenda_element_prompt(transcript_segment, args) +} +\arguments{ +\item{transcript_segment}{A segment of the transcript to be used for +extracting the details of an agenda element. Can be a character vector +representing the data in CSV format or a data frame.} + +\item{args}{A list of arguments to be passed to the prompt template. They can +include: event_description and vocabulary.} +} +\value{ +A prompt used by \code{infer_agenda_from_transcript()}. + +, +} +\description{ +This function is used by \code{infer_agenda_from_transcript()} to generate a +prompt for extracting the details of an agenda element from a transcript. +} diff --git a/man/generate_agenda_inference_prompt.Rd b/man/generate_agenda_inference_prompt.Rd new file mode 100644 index 0000000..fc87324 --- /dev/null +++ b/man/generate_agenda_inference_prompt.Rd @@ -0,0 +1,23 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/prompts.R +\name{generate_agenda_inference_prompt} +\alias{generate_agenda_inference_prompt} +\title{Generate the agenda inference prompt} +\usage{ +generate_agenda_inference_prompt(transcript_segment, args) +} +\arguments{ +\item{transcript_segment}{A segment of the transcript to be used for +inferring the agenda. Can be a character vector representing the data in CSV +format or a data frame.} + +\item{args}{A list of arguments to be passed to the prompt template. They can +include: event_description, vocabulary and expected_agenda.} +} +\value{ +A prompt used by \code{infer_agenda_from_transcript()}. +} +\description{ +This function is used by \code{infer_agenda_from_transcript()} to generate a +prompt for inferring the agenda from a transcript. +} diff --git a/man/infer_agenda_from_transcript.Rd b/man/infer_agenda_from_transcript.Rd new file mode 100644 index 0000000..b0f51e3 --- /dev/null +++ b/man/infer_agenda_from_transcript.Rd @@ -0,0 +1,61 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/summarization.R +\name{infer_agenda_from_transcript} +\alias{infer_agenda_from_transcript} +\title{Infer the agenda from a transcript} +\usage{ +infer_agenda_from_transcript( + transcript, + event_description = NULL, + vocabulary = NULL, + diarization_instructions = NULL, + start_time = NULL, + expected_agenda = NULL, + window_size = 3600, + output_file = NULL, + ... +) +} +\arguments{ +\item{transcript}{The transcript to be summarised. Can be a file path or a +data frame.} + +\item{event_description}{A description of the event. Provide context about +the event.} + +\item{vocabulary}{A character vector of specific vocabulary words, names, +definitions, to help the LLM recognise misspellings and abbreviations.} + +\item{start_time}{The start time of the event in the HH:MM(:SS)( AM/PM) +format. Necessary to convert the agenda times from seconds to an easier to +read format.} + +\item{expected_agenda}{The expected agenda of the event. A text description +of the expected agenda. If provided, the LLM will be asked to generate an +agenda that matches this description.} + +\item{window_size}{The time window that will be taken into account when +inferring the agenda. Default is 2 hours. A larger window will increase the +accuracy of the agenda since it will provide context and will prevent to +have talks crossing the window boundaries; also decrease the chance of +having the LLM being over sensitive to small changes in topics, generating +too many small talks. However, a larger window will also require a larger +LLM context.} + +\item{output_file}{An optional file to save the results to. Default is NULL, +i.e., the results are not saved to a file.} + +\item{...}{Additional arguments passed to the \code{interrogate_llm} function. +Keep in consideration that this function needs LLMs that manages long +context and that produce valid JSON outputs. The \code{force_json} argument is +used with OpenAI based LLM but it's not accepted by other LLMs; therefore +the user may need to edit the system prompts to ensure that the output is a +valid JSON.} +} +\value{ +An agenda in the usual list format. +} +\description{ +This function takes a transcript and various optional parameters, and uses an +LLM to generate an agenda. +} diff --git a/man/minutemaker.Rd b/man/minutemaker.Rd index 1680374..85bc140 100644 --- a/man/minutemaker.Rd +++ b/man/minutemaker.Rd @@ -2,8 +2,8 @@ % Please edit documentation in R/minutemaker-package.R \docType{package} \name{minutemaker} -\alias{minutemaker} \alias{minutemaker-package} +\alias{minutemaker} \title{minutemaker} \description{ Package which uses LLM models to generate minutes from audio recordings or transcriptions of meetings. diff --git a/man/speech_to_summary_workflow.Rd b/man/speech_to_summary_workflow.Rd index 66e82cd..ba827bd 100644 --- a/man/speech_to_summary_workflow.Rd +++ b/man/speech_to_summary_workflow.Rd @@ -27,6 +27,10 @@ speech_to_summary_workflow( chat_file = list.files(target_dir, pattern = "Chat", full.names = T)[1], chat_format = "webex", agenda = file.path(target_dir, "agenda.R"), + expected_agenda = NULL, + agenda_generation_window_size = 7200, + agenda_generation_output_file = file.path(target_dir, "agenda.R"), + extra_agenda_generation_args = NULL, event_description = NULL, event_audience = "An audience with understanding of the topic", vocabulary = NULL, @@ -125,8 +129,29 @@ generate the chat file. See \code{add_chat_transcript} for more details.} \item{agenda}{The agenda of the meeting, that is, a list of agenda elements each with a session name, a title, speaker and moderator lists, type of -talk and start and end times. Alternatively, the path to an R file -containing such a list. See \code{summarise_full_meeting} for more details.} +talk, talk description and start and end times. Alternatively, the path to +an R file containing such a list. See \code{summarise_full_meeting} for more +details. If NULL, the user will be asked if the system should try to +generate the agenda automatically, using the \code{infer_agenda_from_transcript} +function.} + +\item{expected_agenda}{A character string. Only used if the \code{agenda} argument +is \code{NULL} and the user requests the automatic agenda generation. this +string will be used to drive the LLM while generating the agenda. See +\code{infer_agenda_from_transcript} for more details.} + +\item{agenda_generation_window_size}{The size of the window in seconds to +analyze at once when generating the agenda. See +\code{infer_agenda_from_transcript} for more details.} + +\item{agenda_generation_output_file}{A string with the path to the output +file where the automatically generated agenda will be written. Should be a +.R file. See \code{infer_agenda_from_transcript} for more details.} + +\item{extra_agenda_generation_args}{Additional arguments passed to the +\code{infer_agenda_from_transcript} function. See \code{infer_agenda_from_transcript} +for more details. Note that the \code{diarization_instructions} argument for this +function will be taken from the \code{extra_agenda_generation_args} argument.} \item{event_description}{A string containing a description of the meeting. See \code{summarise_transcript} for more details.} From b336572f60f8d840b122f32a71a6693e4272c06e Mon Sep 17 00:00:00 2001 From: Angelo D'Ambrosio Date: Fri, 8 Mar 2024 16:01:14 +0100 Subject: [PATCH 14/17] version bump --- DESCRIPTION | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index aac5239..26a0171 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,10 +1,11 @@ Package: minutemaker Title: GenAI-based meeting and conferences minutes generator -Version: 0.5.4 +Version: 0.6.0 Authors@R: person("Angelo", "D'Ambrosio", , "a.dambrosioMD@gmail.com", role = c("aut", "cre"), comment = c(ORCID = "0000-0002-2045-5155")) -Description: Generate meeting minutes starting from an audio recording or a transcripts using speech-to-text and LLMs. +Description: Generate meeting minutes starting from an audio recording or a + transcripts using speech-to-text and LLMs. License: MIT + file LICENSE Imports: dplyr (>= 1.1.4), @@ -20,10 +21,12 @@ Imports: Config/testthat/edition: 3 Encoding: UTF-8 Roxygen: list(markdown = TRUE) -RoxygenNote: 7.2.3 +RoxygenNote: 7.3.1 Suggests: av (>= 0.9.0), + devtools (>= 2.4.5), parallel (>= 4.3.2), testthat (>= 3.0.0), text2vec (>= 0.6.4), tictoc (>= 1.2), + usethis (>= 2.2.3) From 4f199086f93cd846fd61c9944ed99c547dfef8b6 Mon Sep 17 00:00:00 2001 From: Angelo D'Ambrosio Date: Fri, 8 Mar 2024 16:51:17 +0100 Subject: [PATCH 15/17] uniformed defatults --- R/data_management.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/data_management.R b/R/data_management.R index cfc22d4..2e1fc24 100644 --- a/R/data_management.R +++ b/R/data_management.R @@ -1083,7 +1083,7 @@ speech_to_summary_workflow <- function( # Arguments for `summarise_full_meeting` and `infer_agenda_from_transcript` agenda = file.path(target_dir, "agenda.R"), expected_agenda = NULL, - agenda_generation_window_size = 7200, + agenda_generation_window_size = 3600, agenda_generation_output_file = file.path(target_dir, "agenda.R"), extra_agenda_generation_args = NULL, From dbff135d70e5b73ec32d6c8a64a776712db37737 Mon Sep 17 00:00:00 2001 From: Angelo D'Ambrosio Date: Fri, 8 Mar 2024 16:51:27 +0100 Subject: [PATCH 16/17] typo correction --- R/prompts.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/prompts.R b/R/prompts.R index f17ead3..3252e75 100644 --- a/R/prompts.R +++ b/R/prompts.R @@ -76,7 +76,7 @@ set_prompts <- function( aggregate_task_rolling = "Your task is to aggregate the summaries generated from the segments of a meeting/talk into a single, comprehensive text, without loss of information.", agenda_inference_task = collapse( "Your task is to extract individual talks from this transcript, creating an agenda. You can identify them from a change of speaker, and or, a change of topic. Try to detect broad changes of topics so to avoid splitting the transcript into an excessively large number of small talks; a talk usually last at least 10-15 minutes to one hour, so join into one talk very short change of topics, even if the speaker change. Aggregate both the talk itself and possible Q&A sessions in the same talk.", - "You wil be FIRST producing a step by step reasoning of what could be a good subdivision of the transcript into different talks, considering different competing subdivisions, and THEN suggest the agenda. Take speakers, topics, and timings into consideration in your reasoning.", + "You will be FIRST producing a step by step reasoning of what could be a good subdivision of the transcript into different talks, considering different competing subdivisions, and THEN suggest the agenda. Take speakers, topics, and timings into consideration in your reasoning.", "Your output will be a JSON object with two components: your reasoning and the actual agenda. The agenda must be an array of \"talk\" objects, each with the talk title, a short description (1 sentence), a label with the type of talk (e.g. welcome talk, conference outline, conference talk, meeting discussion, Q&A session, etc...), an array with one of more speakers, another array with moderators (if detectable) and the starting and end time in seconds. Add also the \"session\" object if it make sense as grouping.", "Here's an example of the output structure:", "### From 4584c7317c719999497f378b8de60ab8c7d2967c Mon Sep 17 00:00:00 2001 From: Angelo D'Ambrosio Date: Fri, 8 Mar 2024 16:51:44 +0100 Subject: [PATCH 17/17] increase robustness --- R/utils.R | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/R/utils.R b/R/utils.R index a8b5d7f..0a80f75 100644 --- a/R/utils.R +++ b/R/utils.R @@ -24,7 +24,8 @@ check_and_install_dependencies <- function(deps) { if(do_install) { try({ install.packages(dep) - is_installed <- TRUE + # After successful installation, recheck if the package is now installed + is_installed <- requireNamespace(dep, quietly = FALSE) }) } }