Add TrainGCNV input specifying subset list of samples for training #294

Merged (2 commits) on Feb 11, 2022
Changes from 1 commit
23 changes: 19 additions & 4 deletions wdl/TrainGCNV.wdl
@@ -18,8 +18,12 @@ workflow TrainGCNV {
File reference_index # Index (.fai), must be in same dir as fasta
File reference_dict # Dictionary (.dict), must be in same dir as fasta

# Options for subsetting samples for training. Both options require providing sv_pipeline_base_docker
# Assumes all other inputs correspond to the full sample list. Intended for Terra
Int? n_samples_subsample # Number of samples to subsample from provided sample list for trainGCNV (rec: ~100)
Int subsample_seed = 42
# Subset of full sample list on which to train the gCNV model. Overrides n_samples_subsample if both provided
Array[String]? sample_ids_training_subset

# Condense read counts
Int? condense_num_bins
@@ -85,7 +89,7 @@
String linux_docker
String gatk_docker
String condense_counts_docker
String? sv_pipeline_base_docker # required if using n_samples_subsample to select samples
String? sv_pipeline_base_docker # required if using n_samples_subsample or sample_ids_training_subset to subset samples

# Runtime configuration overrides
RuntimeAttr? condense_counts_runtime_attr
@@ -100,7 +104,17 @@
RuntimeAttr? runtime_attr_explode
}

if (defined(n_samples_subsample)) {
if (defined(sample_ids_training_subset)) {
call util.GetSubsampledIndices {
input:
all_strings = samples,
subset_strings = select_first([sample_ids_training_subset]),
prefix = cohort,
sv_pipeline_base_docker = select_first([sv_pipeline_base_docker])
}
}

if (defined(n_samples_subsample) && !defined(sample_ids_training_subset)) {
call util.RandomSubsampleStringArray {
input:
strings = samples,
@@ -111,7 +125,8 @@
}
}

Array[Int] sample_indices = select_first([RandomSubsampleStringArray.subsample_indices_array, range(length(samples))])
Array[Int] sample_indices = select_first([GetSubsampledIndices.subsample_indices_array, RandomSubsampleStringArray.subsample_indices_array, range(length(samples))])
Array[String] sample_ids = select_first([GetSubsampledIndices.subsampled_strings_array, RandomSubsampleStringArray.subsampled_strings_array, samples])
Collaborator:
I think it would be simpler/clearer to move this into the scatter below:

scatter (i in sample_indices) {
    String sample_ids = samples[i]
    call cov.CondenseReadCounts as CondenseReadCounts {
      input:
        counts = count_files[i],
        sample = samples[i],
        num_bins = condense_num_bins,
        expected_bin_size = condense_bin_size,
        condense_counts_docker = condense_counts_docker,
        runtime_attr_override=condense_counts_runtime_attr
    }
  }

Collaborator:
Also, a minor style comment: I might rename this to something that makes it clear it isn't an input. I usually add an extra underscore to the end, or you could call it maybe_subsetted_sample_ids, for example.
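
A minimal standalone sketch of that naming suggestion (the toy workflow and its example inputs are illustrative, not code in this PR; maybe_subsetted_sample_ids is the reviewer's proposed name): declare the per-shard ID inside the scatter, give it a name that doesn't look like a workflow input, and let WDL gather it back into an array.

version 1.0

workflow RenamedScatterIdDemo {
  input {
    Array[String] samples = ["s1", "s2", "s3", "s4"]
    Array[Int] sample_indices = [1, 3]
  }
  scatter (i in sample_indices) {
    # a derived value declared inside the scatter; the name signals it is not an input
    String maybe_subsetted_sample_ids = samples[i]
  }
  output {
    # a String declared inside a scatter is gathered into an Array[String] outside it
    Array[String] training_sample_ids = maybe_subsetted_sample_ids
  }
}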


scatter (i in sample_indices) {
call cov.CondenseReadCounts as CondenseReadCounts {
@@ -138,7 +153,7 @@
preprocessed_intervals = CountsToIntervals.out,
filter_intervals = filter_intervals,
counts = CondenseReadCounts.out,
count_entity_ids = select_first([RandomSubsampleStringArray.subsampled_strings_array, samples]),
count_entity_ids = sample_ids,
cohort_entity_id = cohort,
contig_ploidy_priors = contig_ploidy_priors,
num_intervals_per_scatter = num_intervals_per_scatter,
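As an aside for readers skimming the diff, here is a toy, standalone sketch of the precedence the new sample_indices / sample_ids lines implement (workflow name and example values are illustrative, not part of this PR): an explicit training subset wins over a random subsample, and the full sample list is used only when neither subsetting option is provided.

version 1.0

workflow TrainingSubsetPrecedenceDemo {
  input {
    Array[String] samples = ["s1", "s2", "s3", "s4"]
    Array[String]? sample_ids_training_subset   # e.g. ["s2", "s4"]
    Array[String]? random_subsample_ids         # e.g. ["s1", "s3"]
  }
  # select_first returns the first defined value, so the explicit subset overrides the
  # random subsample, which in turn overrides the full sample list
  Array[String] sample_ids = select_first([sample_ids_training_subset, random_subsample_ids, samples])
  output {
    Array[String] chosen_training_samples = sample_ids
  }
}
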
56 changes: 56 additions & 0 deletions wdl/Utils.wdl
@@ -218,6 +218,62 @@ task RandomSubsampleStringArray {
}
}

task GetSubsampledIndices {
input {
Array[String] all_strings
Array[String] subset_strings
String prefix
String sv_pipeline_base_docker
RuntimeAttr? runtime_attr_override
}

String subsample_indices_filename = "~{prefix}.subsample_indices.list"
String subsampled_strings_filename = "~{prefix}.subsampled_strings.list"

RuntimeAttr default_attr = object {
cpu_cores: 1,
mem_gb: 3.75,
Collaborator:
Suggested change:
- mem_gb: 3.75,
+ mem_gb: 1,

disk_gb: 10,
boot_disk_gb: 10,
preemptible_tries: 3,
max_retries: 1
}
RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr])

command <<<

set -euo pipefail
python3 <<CODE
all_strings = ['~{sep="','" all_strings}']
subset_strings = {'~{sep="','" subset_strings}'}
Collaborator:
It would be more robust/scalable to use write_lines() to write the strings to a file and read them in, rather than defining them inline (who knows what would happen if we fed this 10k samples for example).
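
A minimal sketch of that suggestion (the task name, declaration names, and trimmed-down command are assumptions, not code from this PR): write each array to a file with the WDL stdlib function write_lines() and read those files in Python, so the command line never inlines the full sample list.

version 1.0

task GetSubsampledIndicesViaFiles {
  input {
    Array[String] all_strings
    Array[String] subset_strings
    String sv_pipeline_base_docker
  }

  # write_lines() (WDL stdlib) serializes each array to a file, one entry per line
  File all_strings_file = write_lines(all_strings)
  File subset_strings_file = write_lines(subset_strings)

  command <<<
    set -euo pipefail
    python3 <<CODE
with open("~{all_strings_file}") as f:
    all_strings = [line.strip() for line in f if line.strip()]
with open("~{subset_strings_file}") as f:
    subset_strings = {line.strip() for line in f if line.strip()}
# ...the membership check and index/ID writing would continue as in the task below...
CODE
  >>>

  runtime {
    docker: sv_pipeline_base_docker
  }
}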

if not subset_strings.issubset(set(all_strings)):
    raise ValueError("Subset list must be a subset of full list")
with open("~{subsample_indices_filename}", 'w') as indices, open("~{subsampled_strings_filename}", 'w') as strings:
    for i, string in enumerate(all_strings):
        if string in subset_strings:
            indices.write(f"{i}\n")
            strings.write(string + "\n")  # also write sample IDs to ensure the subset order matches the overall order
CODE

>>>

output {
Array[Int] subsample_indices_array = read_lines(subsample_indices_filename)
Array[String] subsampled_strings_array = read_lines(subsampled_strings_filename)
}

runtime {
cpu: select_first([runtime_attr.cpu_cores, default_attr.cpu_cores])
memory: select_first([runtime_attr.mem_gb, default_attr.mem_gb]) + " GiB"
disks: "local-disk " + select_first([runtime_attr.disk_gb, default_attr.disk_gb]) + " HDD"
bootDiskSizeGb: select_first([runtime_attr.boot_disk_gb, default_attr.boot_disk_gb])
docker: sv_pipeline_base_docker
preemptible: select_first([runtime_attr.preemptible_tries, default_attr.preemptible_tries])
maxRetries: select_first([runtime_attr.max_retries, default_attr.max_retries])
}
}


task SubsetPedFile {
input {
File ped_file