Refactoring RT matching (#76)

taxonomicallyinformedannotation · Aug 15, 2023 · d6fcd42 · d6fcd42
1 parent c4da046
commit d6fcd42
Show file tree

Hide file tree

Showing 53 changed files with 1,117 additions and 249 deletions.
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -1,6 +1,6 @@
 Package: timaR
 Title: Taxonomically Informed Metabolite Annotation
-Version: 2.8.2
+Version: 2.9.0
 Authors@R: c(
     person("Adriano", "Rutz", , "rutz@imsb.biol.ethz.ch", role = c("aut", "cre"),
            comment = c(ORCID = "0000-0003-0443-9902")),
@@ -18,7 +18,7 @@ Imports:
     crayon (>= 1.5.2),
     docopt (>= 0.7.1),
     dplyr (>= 1.1.2),
-    httr (>= 1.4.6),
+    httr (>= 1.4.7),
     igraph (>= 1.5.1),
     jsonlite (>= 1.8.7),
     MsBackendMgf (>= 1.8.0),
@@ -93,6 +93,7 @@ Collate:
     'extract_spectra.R'
     'fake_annotations_columns.R'
     'fake_sop_columns.R'
+    'filter_annotations.R'
     'get_file.R'
     'get_example_sirius.R'
     'get_gnps_tables.R'
@@ -117,6 +118,7 @@ Collate:
     'prepare_features_edges.R'
     'prepare_features_tables.R'
     'prepare_libraries_adducts.R'
+    'prepare_libraries_rt.R'
     'prepare_libraries_sop_closed.R'
     'prepare_libraries_sop_ecmdb.R'
     'prepare_libraries_sop_lotus.R'

diff --git a/NAMESPACE b/NAMESPACE
@@ -25,6 +25,7 @@ export(export_spectra_2)
 export(extract_spectra)
 export(fake_annotations_columns)
 export(fake_sop_columns)
+export(filter_annotations)
 export(get_example_sirius)
 export(get_file)
 export(get_gnps_tables)
@@ -50,6 +51,7 @@ export(prepare_features_components)
 export(prepare_features_edges)
 export(prepare_features_tables)
 export(prepare_libraries_adducts)
+export(prepare_libraries_rt)
 export(prepare_libraries_sop_closed)
 export(prepare_libraries_sop_ecmdb)
 export(prepare_libraries_sop_lotus)

diff --git a/NEWS.md b/NEWS.md
@@ -1,5 +1,9 @@
 # timaR
 
+# timaR 2.9.0
+
+* Refactored RT matching (#76)
+
 # timaR 2.8.2
 
 * Change from pbmclapply to pblapply

diff --git a/R/annotate_masses.R b/R/annotate_masses.R
@@ -7,7 +7,6 @@ utils::globalVariables(
     "delta_min",
     "Distance",
     "error_mz",
-    "error_rt",
     "exact_mass",
     "feature_id",
     "feature_id_dest",
@@ -157,8 +156,8 @@ annotate_masses <-
         20
     )
     stopifnot(
-      "Your rt tolerance must be lower or equal to 0.1" = tolerance_rt <=
-        0.1
+      "Your rt tolerance must be lower or equal to 0.05" = tolerance_rt <=
+        0.05
     )
 
     paths <<- parse_yaml_paths()
@@ -465,16 +464,14 @@ annotate_masses <-
         )
       ) |>
       dplyr::mutate(
-        error_mz = adduct_mass - mz_1,
-        error_rt = NA_real_
+        error_mz = adduct_mass - mz_1
       ) |>
       tidytable::select(
         feature_id,
         rt,
         mz,
         score_input,
         error_mz,
-        error_rt,
         exact_mass,
         adduct,
         adduct_mass,
@@ -497,7 +494,6 @@ annotate_masses <-
         score_input,
         library,
         error_mz,
-        error_rt,
         exact_mass
       ) |>
       tidyft::filter(!is.na(exact_mass))
@@ -786,7 +782,6 @@ annotate_masses <-
       tidytable::distinct(
         feature_id,
         error_mz,
-        error_rt,
         structure_name,
         structure_inchikey_2D,
         structure_smiles_2D,

diff --git a/R/annotate_spectra.R b/R/annotate_spectra.R
@@ -7,7 +7,6 @@ utils::globalVariables(
     "paths",
     "precursorMz",
     "presence_ratio",
-    "rtime",
     "score",
     "SLAW_ID",
     "structure_inchikey_2D",
@@ -90,7 +89,6 @@ annotate_spectra <- function(input = params$files$spectral$raw,
   df_empty <- data.frame(
     feature_id = NA,
     error_mz = NA,
-    error_rt = NA,
     structure_name = NA,
     structure_inchikey_2D = NA,
     structure_smiles_2D = NA,
@@ -129,15 +127,11 @@ annotate_spectra <- function(input = params$files$spectral$raw,
 
     query_precursors <- spectra@backend@spectraData$precursorMz
     query_spectra <- spectra@backend@peaksData
-    query_rts <- spectra@backend@spectraData$rtime
     ## TODO find a way to have consistency in spectrum IDs
     query_ids <- spectra@backend@spectraData$acquisitionNum
     if (is.null(query_ids)) {
       query_ids <- spectra@backend@spectraData$spectrum_id
     }
-    if (is.null(query_rts)) {
-      query_rts <- rep(NA_real_, length(spectra))
-    }
 
     if (approx == FALSE) {
       log_debug("Reducing library size...")
@@ -195,7 +189,6 @@ annotate_spectra <- function(input = params$files$spectral$raw,
                    spectral_lib,
                    query_ids,
                    query_spectra,
-                   query_rts,
                    lib_id,
                    minimal,
                    maximal,
@@ -229,7 +222,6 @@ annotate_spectra <- function(input = params$files$spectral$raw,
                     list(
                       "feature_id" = query_ids[[spectrum]],
                       "precursorMz" = precursor,
-                      "rtime" = query_rts[[spectrum]],
                       "target_id" = lib_id[indices][[index]],
                       "score" = as.numeric(score),
                       "count_peaks_matched" = NA_integer_,
@@ -257,7 +249,6 @@ annotate_spectra <- function(input = params$files$spectral$raw,
                 spectral_lib = lib_spectra,
                 query_ids = query_ids,
                 query_spectra = query_spectra,
-                query_rts = query_rts,
                 lib_id = lib_id,
                 minimal = minimal,
                 maximal = maximal
@@ -295,10 +286,6 @@ annotate_spectra <- function(input = params$files$spectral$raw,
     if (is.null(lib_smiles2D)) {
       lib_smiles2D <- rep(NA_character_, length(spectral_library))
     }
-    lib_rts <- spectral_library@backend@spectraData$rtime
-    if (is.null(lib_rts)) {
-      lib_rts <- rep(NA_real_, length(spectral_library))
-    }
     lib_name <- spectral_library@backend@spectraData$name
     if (is.null(lib_name)) {
       lib_name <- rep(NA_character_, length(spectral_library))
@@ -322,7 +309,6 @@ annotate_spectra <- function(input = params$files$spectral$raw,
       "target_inchikey_2D" = lib_inchikey2D,
       "target_smiles" = lib_smiles,
       "target_smiles_2D" = lib_smiles2D,
-      "target_rtime" = lib_rts,
       "target_name" = lib_name,
       "target_formula" = lib_mf,
       "target_exactmass" = lib_mass,
@@ -336,8 +322,6 @@ annotate_spectra <- function(input = params$files$spectral$raw,
     df_final <- df_final |>
       tidytable::rowwise() |>
       dplyr::mutate(
-        ## Working in minutes
-        error_rt = (target_rtime - rtime) / 60,
         error_mz = target_precursorMz - precursorMz,
         structure_inchikey_2D = ifelse(
           test = is.na(target_inchikey_2D),
@@ -357,7 +341,6 @@ annotate_spectra <- function(input = params$files$spectral$raw,
         c(
           "feature_id",
           "error_mz",
-          "error_rt",
           "structure_name" = "target_name",
           "structure_inchikey_2D",
           "structure_smiles_2D",

diff --git a/R/filter_annotations.R b/R/filter_annotations.R
@@ -0,0 +1,98 @@
+utils::globalVariables(c("params"))
+
+#' @title Filter annotations
+#'
+#' @description This function filters initial annotations.
+#'
+#' @param annotations Prepared annotations file
+#' @param features Prepared features file
+#' @param rts Prepared retention time library
+#' @param output Output file
+#' @param tolerance_rt Tolerance to filter retention time
+#' @param parameters Params
+#'
+#' @return NULL
+#'
+#' @export
+#'
+#' @examples NULL
+filter_annotations <-
+  function(annotations = params$files$annotations$prepared,
+           features = params$files$features$prepared,
+           rts = params$files$libraries$temporal$prepared,
+           output = params$files$annotations$filtered,
+           tolerance_rt = params$ms$tolerances$rt$minutes,
+           parameters = params) {
+    stopifnot(
+      "Annotations file(s) do(es) not exist" =
+        rep(TRUE, length(annotations)) ==
+          lapply(X = annotations, file.exists)
+    )
+    stopifnot(
+      "Retention time file(s) do(es) not exist" =
+        rep(TRUE, length(rts)) ==
+          lapply(X = rts, file.exists)
+    )
+    stopifnot("Your features file does not exist." = file.exists(features))
+
+    paths <<- parse_yaml_paths()
+    params <<- parameters
+
+    log_debug(x = "... features")
+    features_table <- tidytable::fread(
+      file = features,
+      colClasses = "character",
+      na.strings = c("", "NA")
+    )
+    log_debug(x = "... annotations")
+    annotation_table <- lapply(
+      X = annotations,
+      FUN = tidytable::fread,
+      colClasses = "character",
+      na.strings = c("", "NA")
+    ) |>
+      tidytable::bind_rows()
+    log_debug(x = "... retention times")
+    rt_table <- lapply(
+      X = rts,
+      FUN = tidytable::fread,
+      colClasses = "character",
+      na.strings = c("", "NA")
+    ) |>
+      tidytable::bind_rows() |>
+      tidytable::rename(rt_target = rt)
+
+    log_debug(
+      "Filtering annotations outside of",
+      tolerance_rt * 3,
+      "minutes tolerance"
+    )
+    features_annotated_table <- features_table |>
+      tidytable::left_join(annotation_table) |>
+      tidytable::left_join(rt_table) |>
+      data.frame() |>
+      dplyr::mutate(error_rt = as.numeric(rt) - as.numeric(rt_target)) |>
+      dplyr::arrange(abs(error_rt)) |>
+      tidytable::tidytable() |>
+      tidytable::distinct(-error_rt, -rt_target,
+        .keep_all = TRUE
+      ) |>
+      data.frame() |>
+      ## TODO adapt for types and improve the * 3
+      dplyr::filter(abs(error_rt) <= abs(tolerance_rt * 3) |
+        is.na(error_rt)) |>
+      tidytable::tidytable() |>
+      tidytable::select(-rt_target, -type)
+
+    ## in case some features had a single filtered annotation
+    final_table <- features_table |>
+      tidytable::left_join(features_annotated_table)
+
+    export_params(step = "filter_annotations")
+    export_output(
+      x = final_table,
+      file = output[[1]]
+    )
+
+    return(output[[1]])
+  }
diff --git a/R/parse_cli_params.R b/R/parse_cli_params.R
@@ -59,6 +59,9 @@ parse_cli_params <- function() {
       params$files$annotations$raw$sirius <-
         as.character(arguments$fil_ann_raw_sir)
     }
+    if (!is.null(arguments$fil_ann_fil)) {
+      params$files$annotations$filtered <- as.character(arguments$fil_ann_fil)
+    }
     if (!is.null(arguments$fil_ann_pre)) {
       params$files$annotations$prepared <- as.character(arguments$fil_ann_pre)
     }
@@ -199,12 +202,13 @@ parse_cli_params <- function() {
     if (!is.null(arguments$names_mgf_po)) {
       params$names$mgf$polarity <- as.character(arguments$names_mgf_po)
     }
-    if (!is.null(arguments$names_mgf_pc)) {
-      params$names$mgf$precursor_charge <- as.character(arguments$names_mgf_pc)
-    }
-    if (!is.null(arguments$names_mgf_pm)) {
-      params$names$mgf$precursor_mz <- as.character(arguments$names_mgf_pm)
-    }
+    # if (!is.null(arguments$names_mgf_pc)) {
+    #   params$names$mgf$precursor_charge <-
+    #     as.character(arguments$names_mgf_pc)
+    # }
+    # if (!is.null(arguments$names_mgf_pm)) {
+    #   params$names$mgf$precursor_mz <- as.character(arguments$names_mgf_pm)
+    # }
     if (!is.null(arguments$names_mgf_sm)) {
       params$names$mgf$smiles <- as.character(arguments$names_mgf_sm)
     }
@@ -270,6 +274,9 @@ parse_cli_params <- function() {
     if (!is.null(arguments$too_tax_che)) {
       params$tools$taxonomies$chemical <- as.character(arguments$too_tax_che)
     }
+    if (!is.null(arguments$units_rt)) {
+      params$units$rt <- as.character(arguments$units_rt)
+    }
     if (!is.null(arguments$wei_glo_bio)) {
       params$weights$global$biological <- as.numeric(arguments$wei_glo_bio)
     }

diff --git a/R/prepare_annotations_gnps.R b/R/prepare_annotations_gnps.R
@@ -5,7 +5,6 @@ utils::globalVariables(
     "count_peaks_explained",
     "count_peaks_matched",
     "error_mz",
-    "error_rt",
     "ExactMass",
     "feature_id",
     "INCHI",
@@ -122,13 +121,11 @@ prepare_annotations_gnps <-
         dplyr::mutate(
           error_mz = as.numeric(MZErrorPPM) *
             1E-6 *
-            as.numeric(Precursor_MZ),
-          error_rt = NA
+            as.numeric(Precursor_MZ)
         ) |>
         tidytable::select(
           feature_id = `#Scan#`,
           error_mz = MassDiff,
-          error_rt,
           library = LibraryName,
           structure_name = Compound_Name,
           score_input = MQScore,

diff --git a/R/prepare_annotations_sirius.R b/R/prepare_annotations_sirius.R
@@ -5,7 +5,6 @@ utils::globalVariables(
     "count_peaks_matched",
     "CSI:FingerIDScore",
     "error_mz",
-    "error_rt",
     "explainedIntensity",
     "feature_id",
     "id",
@@ -251,7 +250,6 @@ prepare_annotations_sirius <-
         tidytable::left_join(canopus_npc_prepared) |>
         tidytable::distinct() |>
         tidyft::mutate(
-          error_rt = NA,
           structure_taxonomy_classyfire_chemontid = NA,
           structure_taxonomy_classyfire_01kingdom = NA,
           ## mirror spectral match

diff --git a/R/prepare_annotations_spectra.R b/R/prepare_annotations_spectra.R
@@ -3,7 +3,6 @@ utils::globalVariables(
     "count_peaks_explained",
     "count_peaks_matched",
     "error_mz",
-    "error_rt",
     "feature_id",
     "params",
     "score",
@@ -109,7 +108,6 @@ prepare_annotations_spectra <-
       tidytable::distinct(
         feature_id,
         error_mz,
-        error_rt,
         structure_name,
         structure_inchikey_2D,
         structure_smiles_2D,