-
Notifications
You must be signed in to change notification settings - Fork 73
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Add TrainGCNV input specifying subset list of samples for training #294
Changes from 1 commit
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change | ||||
---|---|---|---|---|---|---|
|
@@ -218,6 +218,62 @@ task RandomSubsampleStringArray { | |||||
} | ||||||
} | ||||||
|
||||||
# GetSubsampledIndices: locate the entries of `subset_strings` within `all_strings`.
#
# Inputs:
#   all_strings             - full ordered list of strings.
#   subset_strings          - strings to look up; the embedded script raises ValueError
#                             ("Subset list must be a subset of full list") if any entry
#                             is absent from all_strings.
#   prefix                  - prefix for the two output file names
#                             (<prefix>.subsample_indices.list, <prefix>.subsampled_strings.list).
#   sv_pipeline_base_docker - docker image passed to the runtime section.
#   runtime_attr_override   - optional override; fields it leaves unset fall back to
#                             default_attr via select_first in the runtime section.
#
# Outputs:
#   subsample_indices_array  - 0-based positions (from Python enumerate) of the matching
#                              entries, in all_strings order.
#   subsampled_strings_array - the matching strings themselves, in all_strings order, so
#                              callers can check the subset order against the overall order.
#
# NOTE(review): both arrays are interpolated directly into Python string literals with
# sep="','", so a value containing a single quote or backslash would presumably break the
# generated script, and an empty array renders as [''] / {''} rather than an empty
# collection - confirm whether callers can pass such inputs.
task GetSubsampledIndices { | ||||||
input { | ||||||
Array[String] all_strings | ||||||
Array[String] subset_strings | ||||||
String prefix | ||||||
String sv_pipeline_base_docker | ||||||
RuntimeAttr? runtime_attr_override | ||||||
} | ||||||
|
||||||
String subsample_indices_filename = "~{prefix}.subsample_indices.list" | ||||||
String subsampled_strings_filename = "~{prefix}.subsampled_strings.list" | ||||||
|
||||||
RuntimeAttr default_attr = object { | ||||||
cpu_cores: 1, | ||||||
mem_gb: 3.75, | ||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Suggested change
|
||||||
disk_gb: 10, | ||||||
boot_disk_gb: 10, | ||||||
preemptible_tries: 3, | ||||||
max_retries: 1 | ||||||
} | ||||||
RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr]) | ||||||
|
||||||
command <<< | ||||||
|
||||||
set -euo pipefail | ||||||
python3 <<CODE | ||||||
all_strings = ['~{sep="','" all_strings}'] | ||||||
subset_strings = {'~{sep="','" subset_strings}'} | ||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. It would be more robust/scalable to use |
||||||
if not subset_strings.issubset(set(all_strings)): | ||||||
raise ValueError("Subset list must be a subset of full list") | ||||||
with open("~{subsample_indices_filename}", 'w') as indices, open("~{subsampled_strings_filename}", 'w') as strings: | ||||||
for i, string in enumerate(all_strings): | ||||||
if string in subset_strings: | ||||||
indices.write(f"{i}\n") | ||||||
strings.write(string + "\n") # also write sample IDs to ensure the subset order matches the overall order | ||||||
CODE | ||||||
|
||||||
>>> | ||||||
|
||||||
output { | ||||||
Array[Int] subsample_indices_array = read_lines(subsample_indices_filename) | ||||||
Array[String] subsampled_strings_array = read_lines(subsampled_strings_filename) | ||||||
} | ||||||
|
||||||
runtime { | ||||||
cpu: select_first([runtime_attr.cpu_cores, default_attr.cpu_cores]) | ||||||
memory: select_first([runtime_attr.mem_gb, default_attr.mem_gb]) + " GiB" | ||||||
disks: "local-disk " + select_first([runtime_attr.disk_gb, default_attr.disk_gb]) + " HDD" | ||||||
bootDiskSizeGb: select_first([runtime_attr.boot_disk_gb, default_attr.boot_disk_gb]) | ||||||
docker: sv_pipeline_base_docker | ||||||
preemptible: select_first([runtime_attr.preemptible_tries, default_attr.preemptible_tries]) | ||||||
maxRetries: select_first([runtime_attr.max_retries, default_attr.max_retries]) | ||||||
} | ||||||
} | ||||||
|
||||||
|
||||||
task SubsetPedFile { | ||||||
input { | ||||||
File ped_file | ||||||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I think it would be simpler/clearer to move this into the `scatter` below:
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Also, a minor style comment: I might rename this to something that makes it clear it isn't an input. I usually add an extra underscore to the end, or you could call it `maybe_subsetted_sample_ids`, for example.