diff --git a/config/nxf_cram.config b/config/nxf_cram.config
index 3c32bd50..ea9c9bf5 100644
--- a/config/nxf_cram.config
+++ b/config/nxf_cram.config
@@ -10,6 +10,13 @@ params {
     call_str = true
     call_sv = true
     call_cnv = true
+
+    coverage {
+      GRCh38 {
+        default_bed_exon = "${projectDir}/resources/GRCh38/default_exon.bed"
+        default_bed_gene = "${projectDir}/resources/GRCh38/default_gene.bed"
+      }
+    }
   }
 }
 
diff --git a/config/nxf_vcf.config b/config/nxf_vcf.config
index 5aefa630..cea480e2 100644
--- a/config/nxf_vcf.config
+++ b/config/nxf_vcf.config
@@ -111,7 +111,7 @@ params {
       include_crams = true
       max_records = ""
       max_samples = ""
-      template = "${projectDir}/resources/vip-report-template-v7.1.0.html"
+      template = "${projectDir}/resources/vip-report-template-v7.1.1.html"
       config = "${projectDir}/resources/vcf_report_config.json"
       metadata = "${projectDir}/resources/field_metadata.json"
 
diff --git a/install.sh b/install.sh
index 74637bdf..9ed756b1 100755
--- a/install.sh
+++ b/install.sh
@@ -99,6 +99,8 @@ download_files() {
   fi
   urls+=("55d49c8d95ffc9aee2ec584359c197d2" "resources/GRCh38/AlphScore_final_20230825_stripped_GRCh38.tsv.gz")
   urls+=("c6178d80393254789ebf9c43df6f2d6f" "resources/GRCh38/AlphScore_final_20230825_stripped_GRCh38.tsv.gz.tbi")
+  urls+=("515f004afad1c782de590730a9b057bd" "resources/GRCh38/default_exon.bed")
+  urls+=("78a5e20c2176a88ea9a689cb77c8b68d" "resources/GRCh38/default_gene.bed")
   urls+=("8e842bfe9c1eeb0943a588ff5662b9aa" "resources/GRCh38/GCA_000001405.15_GRCh38_no_alt_analysis_set.dict")
   urls+=("5fddbc109c82980f9436aa5c21a57c61" "resources/GRCh38/GCA_000001405.15_GRCh38_no_alt_analysis_set.fna.fai")
   urls+=("aab53048116f541b7aeef2da1c3e4ae7" "resources/GRCh38/GCA_000001405.15_GRCh38_no_alt_analysis_set.fna.gz")
@@ -147,7 +149,7 @@ download_files() {
   # update utils/install.sh when updating inheritance.tsv
   urls+=("519185b8b3b7688b9e99339d4045e3f0" "resources/inheritance_20241211.tsv")
   urls+=("7138e76a38d6f67935699d06082ecacf" "resources/vep/cache/homo_sapiens_refseq_vep_111_GRCh38.tar.gz")
-  urls+=("807a26ce4a3c44c59847081980506f35" "resources/vip-report-template-v7.1.0.html")
+  urls+=("4023abed0bccb31bf18bde3ddd1f2ed6" "resources/vip-report-template-v7.1.1.html")
   # when modifying urls array, please keep list in 'ls -l' order
   for ((i = 0; i < ${#urls[@]}; i += 2)); do
     download_file "${base_url}" "${urls[i+1]}" "${urls[i+0]}" "${output_dir}" "${validate}"
diff --git a/utils/create_default_bed.sh b/utils/create_default_bed.sh
new file mode 100644
index 00000000..32a05d5b
--- /dev/null
+++ b/utils/create_default_bed.sh
@@ -0,0 +1,218 @@
+#!/bin/bash
+set -euo pipefail
+
+SCRIPT_NAME="$(basename "$0")"
+
+# Print the command line help text.
+usage() {
+  echo -e "usage: ${SCRIPT_NAME} -i FILE -m FILE -t TYPES -s SOURCES -o FILE
+create default bed file
+  -i, --input    input gff file (e.g. https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000/001/405/GCF_000001405.40_GRCh38.p14/GCF_000001405.40_GRCh38.p14_genomic.gff.gz)
+  -m, --mapping  assembly report to map GFF contig identifiers to VCF contig identifiers (e.g. https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000/001/405/GCF_000001405.40_GRCh38.p14/GCF_000001405.40_GRCh38.p14_assembly_report.txt)
+  -t, --types    comma separated list of types from the gff that should be included
+  -s, --sources  comma separated list of sources to include, possible values: BestRefSeq,RefSeq,RefSeqFE,Gnomon,cmsearch,Curated Genomic,tRNAscan-SE
+  -o, --output   output bed file location
+  -h, --help     Print this message and exit"
+}
+
+# Convert the selected features of a gzipped GFF file into a four column bed file
+# (contig, start, end, feature ID).
+#
+# arguments:
+#   $1 gzipped GFF file
+#   $2 assembly report used to translate GFF contig identifiers
+#   $3 comma separated feature types (GFF column 3) to include
+#   $4 comma separated sources (GFF column 2) to include
+#   $5 bed file to create
+create() {
+  local -r input="${1}"
+  local -r mapping="${2}"
+  local -r types="${3}"
+  local -r sources="${4}"
+  local -r output="${5}"
+
+  if ! zcat "${input}" | awk -v mapfile="${mapping}" -v types_input="${types}" -v sources_input="${sources}" '
+    BEGIN {
+      FS = OFS = "\t";
+      # load mapping file: column 7 is the key, column 10 the contig name written to the bed file
+      while ((getline line < mapfile) > 0) {
+        if (line ~ /^#/) continue;
+        split(line, fields, "\t");
+        # gsub edits its target in place and returns the number of replacements,
+        # so strip the line ending first and store the cleaned name afterwards
+        contig_name = fields[10];
+        gsub(/[\r\n]+/, "", contig_name);
+        contig_map[fields[7]] = contig_name;
+      }
+      close(mapfile);
+
+      # input types and sources to map for easier use
+      n = split(types_input, included_types, ",");
+      for (i = 1; i <= n; i++) {
+        types[included_types[i]] = 1;
+      }
+      n = split(sources_input, included_sources, ",");
+      for (i = 1; i <= n; i++) {
+        sources[included_sources[i]] = 1;
+      }
+    }
+    {
+      # check if any of the sources match any of the input sources
+      n = split($2, sources_split, "%2C");
+      include = 0;
+      for (i = 1; i <= n; i++) {
+        if (sources_split[i] in sources) {
+          include = 1;
+          break;
+        }
+      }
+      # check if the line has a type that should be included
+      if (include == 1 && ($3 in types)) {
+        # extract the ID attribute from column 9 (key=value pairs separated by ";")
+        id = "";
+        m = split($9, attributes, ";");
+        for (i = 1; i <= m; i++) {
+          split(attributes[i], key_value, "=");
+          if (key_value[1] == "ID") {
+            id = key_value[2];
+            break;
+          }
+        }
+        if ($1 in contig_map) {
+          # GFF starts are 1-based, bed starts are 0-based; ends are identical
+          print contig_map[$1], $4 - 1, $5, id;
+        } else {
+          print "Error: Unknown Contig " $1 " encountered." > "/dev/stderr";
+          exit 1;
+        }
+      }
+    }' > "${output}"; then
+    # remove the partial bed file, otherwise a rerun fails on the already-exists check
+    rm -f "${output}"
+    exit 1
+  fi
+}
+
+# Check the command line arguments and exit with status 1 on the first problem.
+#
+# arguments: same order as create()
+validate() {
+  local -r input="${1}"
+  local -r mapping="${2}"
+  local -r types="${3}"
+  local -r sources="${4}"
+  local -r output="${5}"
+
+  # input
+  if [[ -z "${input}" ]]; then
+    echo -e "missing required -i, --input"
+    exit 1
+  fi
+  if [[ ! -f "${input}" ]]; then
+    echo -e "-i, --input '${input}' does not exist"
+    exit 1
+  fi
+  if [[ "${input}" != *.gff.gz ]]; then
+    echo -e "-i, --input '${input}' is not a '.gff.gz' file"
+    exit 1
+  fi
+
+  # output
+  if [[ -z "${output}" ]]; then
+    echo -e "missing required -o, --output"
+    exit 1
+  fi
+  if [[ "${output}" != *.bed ]]; then
+    echo -e "-o, --output '${output}' is not a '.bed' file"
+    exit 1
+  fi
+  if [[ -f "${output}" ]]; then
+    echo -e "-o, --output '${output}' already exists"
+    exit 1
+  fi
+
+  # mapping
+  if [[ -z "${mapping}" ]]; then
+    echo -e "missing required -m, --mapping"
+    exit 1
+  fi
+  if [[ ! -f "${mapping}" ]]; then
+    echo -e "-m, --mapping '${mapping}' does not exist"
+    exit 1
+  fi
+  if [[ "${mapping}" != *.txt ]]; then
+    echo -e "-m, --mapping '${mapping}' is not a '.txt' file"
+    exit 1
+  fi
+
+  # sources
+  if [[ -z "${sources}" ]]; then
+    echo -e "missing required -s, --sources"
+    exit 1
+  fi
+  # types
+  if [[ -z "${types}" ]]; then
+    echo -e "missing required -t, --types"
+    exit 1
+  fi
+}
+
+# Parse the command line, validate the arguments and create the bed file.
+main() {
+  # assign in the condition: "local args=$(...)" would report the status of "local"
+  local args
+  if ! args=$(getopt -a -n "${SCRIPT_NAME}" -o i:o:m:t:s:h --long input:,mapping:,types:,sources:,output:,help -- "$@"); then
+    usage
+    exit 2
+  fi
+
+  # defaults keep "set -u" from aborting before validate() can report a missing option
+  local input=""
+  local mapping=""
+  local types=""
+  local sources=""
+  local output=""
+
+  eval set -- "${args}"
+  while :; do
+    case "$1" in
+    -h | --help)
+      usage
+      exit 0
+      ;;
+    -i | --input)
+      input="$2"
+      shift 2
+      ;;
+    -m | --mapping)
+      mapping="$2"
+      shift 2
+      ;;
+    -t | --types)
+      types="$2"
+      shift 2
+      ;;
+    -s | --sources)
+      sources="$2"
+      shift 2
+      ;;
+    -o | --output)
+      output="$2"
+      shift 2
+      ;;
+    --)
+      shift
+      break
+      ;;
+    *)
+      usage
+      exit 2
+      ;;
+    esac
+  done
+
+  validate "${input}" "${mapping}" "${types}" "${sources}" "${output}"
+  create "${input}" "${mapping}" "${types}" "${sources}" "${output}"
+}
+
+main "${@}"
diff --git a/vip_cram.nf b/vip_cram.nf
index 4595ff3b..9bb94b28 100644
--- a/vip_cram.nf
+++ b/vip_cram.nf
@@ -32,8 +32,7 @@ workflow cram {
 
   // coverage
   ch_cram_multi.coverage
-    | filter { meta -> meta.project.regions != null }
-    | map { meta -> [meta, meta.sample.cram.data, meta.sample.cram.index, meta.project.regions] }
+    | map { meta -> [meta, meta.sample.cram.data, meta.sample.cram.index, meta.project.regions ? meta.project.regions : meta.project.sequencing_method == "WES" ? params.cram.coverage[meta.project.assembly].default_bed_exon : params.cram.coverage[meta.project.assembly].default_bed_gene ] }
     | coverage
 
   // snv