Add TrainGCNV input specifying subset list of samples for training #294

Merged (2 commits) on Feb 11, 2022
Changes from 1 commit
23 changes: 19 additions & 4 deletions wdl/TrainGCNV.wdl
@@ -18,8 +18,12 @@ workflow TrainGCNV {
File reference_index # Index (.fai), must be in same dir as fasta
File reference_dict # Dictionary (.dict), must be in same dir as fasta

# Options for subsetting samples for training. Both options require providing sv_pipeline_base_docker
# Assumes all other inputs correspond to the full sample list. Intended for Terra
Int? n_samples_subsample # Number of samples to subsample from provided sample list for trainGCNV (rec: ~100)
Int subsample_seed = 42
# Subset of full sample list on which to train the gCNV model. Overrides n_samples_subsample if both provided
Array[String]? sample_ids_training_subset

# Condense read counts
Int? condense_num_bins
@@ -85,7 +89,7 @@
String linux_docker
String gatk_docker
String condense_counts_docker
String? sv_pipeline_base_docker # required if using n_samples_subsample to select samples
String? sv_pipeline_base_docker # required if using n_samples_subsample or sample_ids_training_subset to subset samples

# Runtime configuration overrides
RuntimeAttr? condense_counts_runtime_attr
@@ -100,7 +104,17 @@
RuntimeAttr? runtime_attr_explode
}

if (defined(n_samples_subsample)) {
if (defined(sample_ids_training_subset)) {
call util.GetSubsampledIndices {
input:
all_strings = samples,
subset_strings = select_first([sample_ids_training_subset]),
prefix = cohort,
sv_pipeline_base_docker = select_first([sv_pipeline_base_docker])
}
}

if (defined(n_samples_subsample) && !defined(sample_ids_training_subset)) {
call util.RandomSubsampleStringArray {
input:
strings = samples,
@@ -111,7 +125,8 @@
}
}

Array[Int] sample_indices = select_first([RandomSubsampleStringArray.subsample_indices_array, range(length(samples))])
Array[Int] sample_indices = select_first([GetSubsampledIndices.subsample_indices_array, RandomSubsampleStringArray.subsample_indices_array, range(length(samples))])
Array[String] sample_ids = select_first([GetSubsampledIndices.subsampled_strings_array, RandomSubsampleStringArray.subsampled_strings_array, samples])
Collaborator:
I think it would be simpler/clearer to move this into the scatter below:

scatter (i in sample_indices) {
    String sample_ids = samples[i]
    call cov.CondenseReadCounts as CondenseReadCounts {
      input:
        counts = count_files[i],
        sample = samples[i],
        num_bins = condense_num_bins,
        expected_bin_size = condense_bin_size,
        condense_counts_docker = condense_counts_docker,
        runtime_attr_override=condense_counts_runtime_attr
    }
  }

Collaborator:
Also, a minor style comment: I might rename this to something that makes it clear it isn't an input. I usually add an extra underscore to the end, or you could call it maybe_subsetted_sample_ids, for example.
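
A minimal standalone sketch of that naming suggestion (the toy workflow and its example inputs are illustrative, not code in this PR; maybe_subsetted_sample_ids is the reviewer's proposed name): declare the per-shard ID inside the scatter, give it a name that doesn't look like a workflow input, and let WDL gather it back into an array.

version 1.0

workflow RenamedScatterIdDemo {
  input {
    Array[String] samples = ["s1", "s2", "s3", "s4"]
    Array[Int] sample_indices = [1, 3]
  }
  scatter (i in sample_indices) {
    # a derived value declared inside the scatter; the name signals it is not an input
    String maybe_subsetted_sample_ids = samples[i]
  }
  output {
    # a String declared inside a scatter is gathered into an Array[String] outside it
    Array[String] training_sample_ids = maybe_subsetted_sample_ids
  }
}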


scatter (i in sample_indices) {
call cov.CondenseReadCounts as CondenseReadCounts {
@@ -138,7 +153,7 @@
preprocessed_intervals = CountsToIntervals.out,
filter_intervals = filter_intervals,
counts = CondenseReadCounts.out,
count_entity_ids = select_first([RandomSubsampleStringArray.subsampled_strings_array, samples]),
count_entity_ids = sample_ids,
cohort_entity_id = cohort,
contig_ploidy_priors = contig_ploidy_priors,
num_intervals_per_scatter = num_intervals_per_scatter,
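As an aside for readers skimming the diff, here is a toy, standalone sketch of the precedence the new sample_indices / sample_ids lines implement (workflow name and example values are illustrative, not part of this PR): an explicit training subset wins over a random subsample, and the full sample list is used only when neither subsetting option is provided.

version 1.0

workflow TrainingSubsetPrecedenceDemo {
  input {
    Array[String] samples = ["s1", "s2", "s3", "s4"]
    Array[String]? sample_ids_training_subset   # e.g. ["s2", "s4"]
    Array[String]? random_subsample_ids         # e.g. ["s1", "s3"]
  }
  # select_first returns the first defined value, so the explicit subset overrides the
  # random subsample, which in turn overrides the full sample list
  Array[String] sample_ids = select_first([sample_ids_training_subset, random_subsample_ids, samples])
  output {
    Array[String] chosen_training_samples = sample_ids
  }
}
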
56 changes: 56 additions & 0 deletions wdl/Utils.wdl
@@ -218,6 +218,62 @@ task RandomSubsampleStringArray {
}
}

task GetSubsampledIndices {
input {
Array[String] all_strings
Array[String] subset_strings
String prefix
String sv_pipeline_base_docker
RuntimeAttr? runtime_attr_override
}

String subsample_indices_filename = "~{prefix}.subsample_indices.list"
String subsampled_strings_filename = "~{prefix}.subsampled_strings.list"

RuntimeAttr default_attr = object {
cpu_cores: 1,
mem_gb: 3.75,
Collaborator:
Suggested change:
- mem_gb: 3.75,
+ mem_gb: 1,

disk_gb: 10,
boot_disk_gb: 10,
preemptible_tries: 3,
max_retries: 1
}
RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr])

command <<<

set -euo pipefail
python3 <<CODE
all_strings = ['~{sep="','" all_strings}']
subset_strings = {'~{sep="','" subset_strings}'}
Collaborator:
It would be more robust/scalable to use write_lines() to write the strings to a file and read them in, rather than defining them inline (who knows what would happen if we fed this 10k samples for example).
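
A minimal sketch of that suggestion (the task name, declaration names, and trimmed-down command are assumptions, not code from this PR): write each array to a file with the WDL stdlib function write_lines() and read those files in Python, so the command line never inlines the full sample list.

version 1.0

task GetSubsampledIndicesViaFiles {
  input {
    Array[String] all_strings
    Array[String] subset_strings
    String sv_pipeline_base_docker
  }

  # write_lines() (WDL stdlib) serializes each array to a file, one entry per line
  File all_strings_file = write_lines(all_strings)
  File subset_strings_file = write_lines(subset_strings)

  command <<<
    set -euo pipefail
    python3 <<CODE
with open("~{all_strings_file}") as f:
    all_strings = [line.strip() for line in f if line.strip()]
with open("~{subset_strings_file}") as f:
    subset_strings = {line.strip() for line in f if line.strip()}
# ...the membership check and index/ID writing would continue as in the task below...
CODE
  >>>

  runtime {
    docker: sv_pipeline_base_docker
  }
}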

if not subset_strings.issubset(set(all_strings)):
    raise ValueError("Subset list must be a subset of full list")
with open("~{subsample_indices_filename}", 'w') as indices, open("~{subsampled_strings_filename}", 'w') as strings:
    for i, string in enumerate(all_strings):
        if string in subset_strings:
            indices.write(f"{i}\n")
            strings.write(string + "\n")  # also write sample IDs to ensure the subset order matches the overall order
CODE

>>>

output {
Array[Int] subsample_indices_array = read_lines(subsample_indices_filename)
Array[String] subsampled_strings_array = read_lines(subsampled_strings_filename)
}

runtime {
cpu: select_first([runtime_attr.cpu_cores, default_attr.cpu_cores])
memory: select_first([runtime_attr.mem_gb, default_attr.mem_gb]) + " GiB"
disks: "local-disk " + select_first([runtime_attr.disk_gb, default_attr.disk_gb]) + " HDD"
bootDiskSizeGb: select_first([runtime_attr.boot_disk_gb, default_attr.boot_disk_gb])
docker: sv_pipeline_base_docker
preemptible: select_first([runtime_attr.preemptible_tries, default_attr.preemptible_tries])
maxRetries: select_first([runtime_attr.max_retries, default_attr.max_retries])
}
}


task SubsetPedFile {
input {
File ped_file