From 164505387fa450e3d3d3a7ff506f674260f60a6a Mon Sep 17 00:00:00 2001 From: Emma Pierce-Hoffman Date: Thu, 10 Feb 2022 11:57:05 -0500 Subject: [PATCH 1/2] add TrainGCNV input specifying subset list of samples for training --- wdl/TrainGCNV.wdl | 23 +++++++++++++++---- wdl/Utils.wdl | 56 +++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 75 insertions(+), 4 deletions(-) diff --git a/wdl/TrainGCNV.wdl b/wdl/TrainGCNV.wdl index fd03bff31..fc60f3e00 100644 --- a/wdl/TrainGCNV.wdl +++ b/wdl/TrainGCNV.wdl @@ -18,8 +18,12 @@ workflow TrainGCNV { File reference_index # Index (.fai), must be in same dir as fasta File reference_dict # Dictionary (.dict), must be in same dir as fasta + # Options for subsetting samples for training. Both options require providing sv_pipeline_base_docker + # Assumes all other inputs correspond to the full sample list. Intended for Terra Int? n_samples_subsample # Number of samples to subsample from provided sample list for trainGCNV (rec: ~100) Int subsample_seed = 42 + # Subset of full sample list on which to train the gCNV model. Overrides n_samples_subsample if both provided + Array[String]? sample_ids_training_subset # Condense read counts Int? condense_num_bins @@ -85,7 +89,7 @@ workflow TrainGCNV { String linux_docker String gatk_docker String condense_counts_docker - String? sv_pipeline_base_docker # required if using n_samples_subsample to select samples + String? sv_pipeline_base_docker # required if using n_samples_subsample or sample_ids_training_subset to subset samples # Runtime configuration overrides RuntimeAttr? condense_counts_runtime_attr @@ -100,7 +104,17 @@ workflow TrainGCNV { RuntimeAttr? runtime_attr_explode } - if (defined(n_samples_subsample)) { + if (defined(sample_ids_training_subset)) { + call util.GetSubsampledIndices { + input: + all_strings = samples, + subset_strings = select_first([sample_ids_training_subset]), + prefix = cohort, + sv_pipeline_base_docker = select_first([sv_pipeline_base_docker]) + } + } + + if (defined(n_samples_subsample) && !defined(sample_ids_training_subset)) { call util.RandomSubsampleStringArray { input: strings = samples, @@ -111,7 +125,8 @@ workflow TrainGCNV { } } - Array[Int] sample_indices = select_first([RandomSubsampleStringArray.subsample_indices_array, range(length(samples))]) + Array[Int] sample_indices = select_first([GetSubsampledIndices.subsample_indices_array, RandomSubsampleStringArray.subsample_indices_array, range(length(samples))]) + Array[String] sample_ids = select_first([GetSubsampledIndices.subsampled_strings_array, RandomSubsampleStringArray.subsampled_strings_array, samples]) scatter (i in sample_indices) { call cov.CondenseReadCounts as CondenseReadCounts { @@ -138,7 +153,7 @@ workflow TrainGCNV { preprocessed_intervals = CountsToIntervals.out, filter_intervals = filter_intervals, counts = CondenseReadCounts.out, - count_entity_ids = select_first([RandomSubsampleStringArray.subsampled_strings_array, samples]), + count_entity_ids = sample_ids, cohort_entity_id = cohort, contig_ploidy_priors = contig_ploidy_priors, num_intervals_per_scatter = num_intervals_per_scatter, diff --git a/wdl/Utils.wdl b/wdl/Utils.wdl index a5a14353e..2f2f8ec16 100644 --- a/wdl/Utils.wdl +++ b/wdl/Utils.wdl @@ -218,6 +218,62 @@ task RandomSubsampleStringArray { } } +task GetSubsampledIndices { + input { + Array[String] all_strings + Array[String] subset_strings + String prefix + String sv_pipeline_base_docker + RuntimeAttr? runtime_attr_override + } + + String subsample_indices_filename = "~{prefix}.subsample_indices.list" + String subsampled_strings_filename = "~{prefix}.subsampled_strings.list" + + RuntimeAttr default_attr = object { + cpu_cores: 1, + mem_gb: 3.75, + disk_gb: 10, + boot_disk_gb: 10, + preemptible_tries: 3, + max_retries: 1 + } + RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr]) + + command <<< + + set -euo pipefail + python3 <>> + + output { + Array[Int] subsample_indices_array = read_lines(subsample_indices_filename) + Array[String] subsampled_strings_array = read_lines(subsampled_strings_filename) + } + + runtime { + cpu: select_first([runtime_attr.cpu_cores, default_attr.cpu_cores]) + memory: select_first([runtime_attr.mem_gb, default_attr.mem_gb]) + " GiB" + disks: "local-disk " + select_first([runtime_attr.disk_gb, default_attr.disk_gb]) + " HDD" + bootDiskSizeGb: select_first([runtime_attr.boot_disk_gb, default_attr.boot_disk_gb]) + docker: sv_pipeline_base_docker + preemptible: select_first([runtime_attr.preemptible_tries, default_attr.preemptible_tries]) + maxRetries: select_first([runtime_attr.max_retries, default_attr.max_retries]) + } +} + + task SubsetPedFile { input { File ped_file From ff64f42e37661f65ea61920973999148162c87a4 Mon Sep 17 00:00:00 2001 From: Emma Pierce-Hoffman Date: Thu, 10 Feb 2022 15:35:51 -0500 Subject: [PATCH 2/2] address review comments --- wdl/TrainGCNV.wdl | 10 +++++----- wdl/Utils.wdl | 21 +++++++++------------ 2 files changed, 14 insertions(+), 17 deletions(-) diff --git a/wdl/TrainGCNV.wdl b/wdl/TrainGCNV.wdl index fc60f3e00..167b5045e 100644 --- a/wdl/TrainGCNV.wdl +++ b/wdl/TrainGCNV.wdl @@ -107,8 +107,8 @@ workflow TrainGCNV { if (defined(sample_ids_training_subset)) { call util.GetSubsampledIndices { input: - all_strings = samples, - subset_strings = select_first([sample_ids_training_subset]), + all_strings = write_lines(samples), + subset_strings = write_lines(select_first([sample_ids_training_subset])), prefix = cohort, sv_pipeline_base_docker = select_first([sv_pipeline_base_docker]) } @@ -117,7 +117,7 @@ workflow TrainGCNV { if (defined(n_samples_subsample) && !defined(sample_ids_training_subset)) { call util.RandomSubsampleStringArray { input: - strings = samples, + strings = write_lines(samples), seed = subsample_seed, subset_size = select_first([n_samples_subsample]), prefix = cohort, @@ -126,9 +126,9 @@ workflow TrainGCNV { } Array[Int] sample_indices = select_first([GetSubsampledIndices.subsample_indices_array, RandomSubsampleStringArray.subsample_indices_array, range(length(samples))]) - Array[String] sample_ids = select_first([GetSubsampledIndices.subsampled_strings_array, RandomSubsampleStringArray.subsampled_strings_array, samples]) scatter (i in sample_indices) { + String sample_ids_ = samples[i] call cov.CondenseReadCounts as CondenseReadCounts { input: counts = count_files[i], @@ -153,7 +153,7 @@ workflow TrainGCNV { preprocessed_intervals = CountsToIntervals.out, filter_intervals = filter_intervals, counts = CondenseReadCounts.out, - count_entity_ids = sample_ids, + count_entity_ids = sample_ids_, cohort_entity_id = cohort, contig_ploidy_priors = contig_ploidy_priors, num_intervals_per_scatter = num_intervals_per_scatter, diff --git a/wdl/Utils.wdl b/wdl/Utils.wdl index 2f2f8ec16..f3d8b81ed 100644 --- a/wdl/Utils.wdl +++ b/wdl/Utils.wdl @@ -159,7 +159,7 @@ task RunQC { task RandomSubsampleStringArray { input { - Array[String] strings + File strings Int seed Int subset_size String prefix @@ -172,7 +172,7 @@ task RandomSubsampleStringArray { RuntimeAttr default_attr = object { cpu_cores: 1, - mem_gb: 3.75, + mem_gb: 1, disk_gb: 10, boot_disk_gb: 10, preemptible_tries: 3, @@ -185,7 +185,7 @@ task RandomSubsampleStringArray { set -euo pipefail python3 < array_len: raise ValueError("Subsample quantity ~{subset_size} cannot > array length %d" % array_len) @@ -220,19 +220,18 @@ task RandomSubsampleStringArray { task GetSubsampledIndices { input { - Array[String] all_strings - Array[String] subset_strings + File all_strings + File subset_strings String prefix String sv_pipeline_base_docker RuntimeAttr? runtime_attr_override } String subsample_indices_filename = "~{prefix}.subsample_indices.list" - String subsampled_strings_filename = "~{prefix}.subsampled_strings.list" RuntimeAttr default_attr = object { cpu_cores: 1, - mem_gb: 3.75, + mem_gb: 1, disk_gb: 10, boot_disk_gb: 10, preemptible_tries: 3, @@ -244,22 +243,20 @@ task GetSubsampledIndices { set -euo pipefail python3 <>> output { Array[Int] subsample_indices_array = read_lines(subsample_indices_filename) - Array[String] subsampled_strings_array = read_lines(subsampled_strings_filename) } runtime {