From 164505387fa450e3d3d3a7ff506f674260f60a6a Mon Sep 17 00:00:00 2001
From: Emma Pierce-Hoffman <epierceh@broadinstitute.org>
Date: Thu, 10 Feb 2022 11:57:05 -0500
Subject: [PATCH 1/2] add TrainGCNV input specifying subset list of samples for
 training

---
 wdl/TrainGCNV.wdl | 23 +++++++++++++++----
 wdl/Utils.wdl     | 56 +++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 75 insertions(+), 4 deletions(-)

diff --git a/wdl/TrainGCNV.wdl b/wdl/TrainGCNV.wdl
index fd03bff31..fc60f3e00 100644
--- a/wdl/TrainGCNV.wdl
+++ b/wdl/TrainGCNV.wdl
@@ -18,8 +18,12 @@ workflow TrainGCNV {
     File reference_index    # Index (.fai), must be in same dir as fasta
     File reference_dict     # Dictionary (.dict), must be in same dir as fasta
 
+    # Options for subsetting samples for training. Both options require sv_pipeline_base_docker to be provided.
+    # All other inputs are assumed to correspond to the full sample list. Intended for use in Terra.
     Int? n_samples_subsample # Number of samples to subsample from provided sample list for trainGCNV (rec: ~100)
     Int subsample_seed = 42
+    # Subset of full sample list on which to train the gCNV model. Overrides n_samples_subsample if both are provided
+    Array[String]? sample_ids_training_subset
 
     # Condense read counts
     Int? condense_num_bins
@@ -85,7 +89,7 @@ workflow TrainGCNV {
     String linux_docker
     String gatk_docker
     String condense_counts_docker
-    String? sv_pipeline_base_docker # required if using n_samples_subsample to select samples
+    String? sv_pipeline_base_docker # required if using n_samples_subsample or sample_ids_training_subset to subset samples
 
     # Runtime configuration overrides
     RuntimeAttr? condense_counts_runtime_attr
@@ -100,7 +104,17 @@ workflow TrainGCNV {
     RuntimeAttr? runtime_attr_explode
   }
 
-  if (defined(n_samples_subsample)) {
+  if (defined(sample_ids_training_subset)) {
+    call util.GetSubsampledIndices {
+      input:
+        all_strings = samples,
+        subset_strings = select_first([sample_ids_training_subset]),
+        prefix = cohort,
+        sv_pipeline_base_docker = select_first([sv_pipeline_base_docker])
+    }
+  }
+
+  if (defined(n_samples_subsample) && !defined(sample_ids_training_subset)) {
     call util.RandomSubsampleStringArray {
       input:
         strings = samples,
@@ -111,7 +125,8 @@ workflow TrainGCNV {
     }
   }
 
-  Array[Int] sample_indices = select_first([RandomSubsampleStringArray.subsample_indices_array, range(length(samples))])
+  Array[Int] sample_indices = select_first([GetSubsampledIndices.subsample_indices_array, RandomSubsampleStringArray.subsample_indices_array, range(length(samples))])
+  Array[String] sample_ids = select_first([GetSubsampledIndices.subsampled_strings_array, RandomSubsampleStringArray.subsampled_strings_array, samples])
 
   scatter (i in sample_indices) {
     call cov.CondenseReadCounts as CondenseReadCounts {
@@ -138,7 +153,7 @@ workflow TrainGCNV {
       preprocessed_intervals = CountsToIntervals.out,
       filter_intervals = filter_intervals,
       counts = CondenseReadCounts.out,
-      count_entity_ids = select_first([RandomSubsampleStringArray.subsampled_strings_array, samples]),
+      count_entity_ids = sample_ids,
       cohort_entity_id = cohort,
       contig_ploidy_priors = contig_ploidy_priors,
       num_intervals_per_scatter = num_intervals_per_scatter,
diff --git a/wdl/Utils.wdl b/wdl/Utils.wdl
index a5a14353e..2f2f8ec16 100644
--- a/wdl/Utils.wdl
+++ b/wdl/Utils.wdl
@@ -218,6 +218,62 @@ task RandomSubsampleStringArray {
   }
 }
 
+task GetSubsampledIndices {
+  input {
+    Array[String] all_strings
+    Array[String] subset_strings
+    String prefix
+    String sv_pipeline_base_docker
+    RuntimeAttr? runtime_attr_override
+  }
+
+  String subsample_indices_filename = "~{prefix}.subsample_indices.list"
+  String subsampled_strings_filename = "~{prefix}.subsampled_strings.list"
+
+  RuntimeAttr default_attr = object {
+    cpu_cores: 1,
+    mem_gb: 3.75,
+    disk_gb: 10,
+    boot_disk_gb: 10,
+    preemptible_tries: 3,
+    max_retries: 1
+  }
+  RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr])
+
+  command <<<
+
+    set -euo pipefail
+    python3 <<CODE
+    all_strings = ['~{sep="','" all_strings}']
+    subset_strings = {'~{sep="','" subset_strings}'}
+    if not subset_strings.issubset(set(all_strings)):
+      raise ValueError("Subset list must be a subset of full list")
+    with open("~{subsample_indices_filename}", 'w') as indices, open("~{subsampled_strings_filename}", 'w') as strings:
+      for i, string in enumerate(all_strings):
+        if string in subset_strings:
+          indices.write(f"{i}\n")
+          strings.write(string + "\n")  # also write sample IDs to ensure the subset order matches the overall order
+    CODE
+
+  >>>
+
+  output {
+    Array[Int] subsample_indices_array = read_lines(subsample_indices_filename)
+    Array[String] subsampled_strings_array = read_lines(subsampled_strings_filename)
+  }
+
+  runtime {
+    cpu: select_first([runtime_attr.cpu_cores, default_attr.cpu_cores])
+    memory: select_first([runtime_attr.mem_gb, default_attr.mem_gb]) + " GiB"
+    disks: "local-disk " + select_first([runtime_attr.disk_gb, default_attr.disk_gb]) + " HDD"
+    bootDiskSizeGb: select_first([runtime_attr.boot_disk_gb, default_attr.boot_disk_gb])
+    docker: sv_pipeline_base_docker
+    preemptible: select_first([runtime_attr.preemptible_tries, default_attr.preemptible_tries])
+    maxRetries: select_first([runtime_attr.max_retries, default_attr.max_retries])
+  }
+}
+
+
 task SubsetPedFile {
   input {
     File ped_file

From ff64f42e37661f65ea61920973999148162c87a4 Mon Sep 17 00:00:00 2001
From: Emma Pierce-Hoffman <epierceh@broadinstitute.org>
Date: Thu, 10 Feb 2022 15:35:51 -0500
Subject: [PATCH 2/2] address review comments

---
 wdl/TrainGCNV.wdl | 10 +++++-----
 wdl/Utils.wdl     | 21 +++++++++------------
 2 files changed, 14 insertions(+), 17 deletions(-)

diff --git a/wdl/TrainGCNV.wdl b/wdl/TrainGCNV.wdl
index fc60f3e00..167b5045e 100644
--- a/wdl/TrainGCNV.wdl
+++ b/wdl/TrainGCNV.wdl
@@ -107,8 +107,8 @@ workflow TrainGCNV {
   if (defined(sample_ids_training_subset)) {
     call util.GetSubsampledIndices {
       input:
-        all_strings = samples,
-        subset_strings = select_first([sample_ids_training_subset]),
+        all_strings = write_lines(samples),
+        subset_strings = write_lines(select_first([sample_ids_training_subset])),
         prefix = cohort,
         sv_pipeline_base_docker = select_first([sv_pipeline_base_docker])
     }
@@ -117,7 +117,7 @@ workflow TrainGCNV {
   if (defined(n_samples_subsample) && !defined(sample_ids_training_subset)) {
     call util.RandomSubsampleStringArray {
       input:
-        strings = samples,
+        strings = write_lines(samples),
         seed = subsample_seed,
         subset_size = select_first([n_samples_subsample]),
         prefix = cohort,
@@ -126,9 +126,9 @@ workflow TrainGCNV {
   }
 
   Array[Int] sample_indices = select_first([GetSubsampledIndices.subsample_indices_array, RandomSubsampleStringArray.subsample_indices_array, range(length(samples))])
-  Array[String] sample_ids = select_first([GetSubsampledIndices.subsampled_strings_array, RandomSubsampleStringArray.subsampled_strings_array, samples])
 
   scatter (i in sample_indices) {
+    String sample_ids_ = samples[i]
     call cov.CondenseReadCounts as CondenseReadCounts {
       input:
         counts = count_files[i],
@@ -153,7 +153,7 @@ workflow TrainGCNV {
       preprocessed_intervals = CountsToIntervals.out,
       filter_intervals = filter_intervals,
       counts = CondenseReadCounts.out,
-      count_entity_ids = sample_ids,
+      count_entity_ids = sample_ids_,
       cohort_entity_id = cohort,
       contig_ploidy_priors = contig_ploidy_priors,
       num_intervals_per_scatter = num_intervals_per_scatter,
diff --git a/wdl/Utils.wdl b/wdl/Utils.wdl
index 2f2f8ec16..f3d8b81ed 100644
--- a/wdl/Utils.wdl
+++ b/wdl/Utils.wdl
@@ -159,7 +159,7 @@ task RunQC {
 
 task RandomSubsampleStringArray {
   input {
-    Array[String] strings
+    File strings
     Int seed
     Int subset_size
     String prefix
@@ -172,7 +172,7 @@ task RandomSubsampleStringArray {
 
   RuntimeAttr default_attr = object {
     cpu_cores: 1,
-    mem_gb: 3.75,
+    mem_gb: 1,
     disk_gb: 10,
     boot_disk_gb: 10,
     preemptible_tries: 3,
@@ -185,7 +185,7 @@ task RandomSubsampleStringArray {
     set -euo pipefail
     python3 <<CODE
     import random
-    string_array = ['~{sep="','" strings}']
+    string_array = [line.rstrip() for line in open("~{strings}", 'r')]
     array_len = len(string_array)
     if ~{subset_size} > array_len:
       raise ValueError("Subsample quantity ~{subset_size} cannot > array length %d" % array_len)
@@ -220,19 +220,18 @@ task RandomSubsampleStringArray {
 
 task GetSubsampledIndices {
   input {
-    Array[String] all_strings
-    Array[String] subset_strings
+    File all_strings
+    File subset_strings
     String prefix
     String sv_pipeline_base_docker
     RuntimeAttr? runtime_attr_override
   }
 
   String subsample_indices_filename = "~{prefix}.subsample_indices.list"
-  String subsampled_strings_filename = "~{prefix}.subsampled_strings.list"
 
   RuntimeAttr default_attr = object {
     cpu_cores: 1,
-    mem_gb: 3.75,
+    mem_gb: 1,
     disk_gb: 10,
     boot_disk_gb: 10,
     preemptible_tries: 3,
@@ -244,22 +243,20 @@ task GetSubsampledIndices {
 
     set -euo pipefail
     python3 <<CODE
-    all_strings = ['~{sep="','" all_strings}']
-    subset_strings = {'~{sep="','" subset_strings}'}
+    all_strings = [line.rstrip() for line in open("~{all_strings}", 'r')]
+    subset_strings = {line.rstrip() for line in open("~{subset_strings}", 'r')}
     if not subset_strings.issubset(set(all_strings)):
       raise ValueError("Subset list must be a subset of full list")
-    with open("~{subsample_indices_filename}", 'w') as indices, open("~{subsampled_strings_filename}", 'w') as strings:
+    with open("~{subsample_indices_filename}", 'w') as indices:
       for i, string in enumerate(all_strings):
         if string in subset_strings:
           indices.write(f"{i}\n")
-          strings.write(string + "\n")  # also write sample IDs to ensure the subset order matches the overall order
     CODE
 
   >>>
 
   output {
     Array[Int] subsample_indices_array = read_lines(subsample_indices_filename)
-    Array[String] subsampled_strings_array = read_lines(subsampled_strings_filename)
   }
 
   runtime {