molgenis · bartcharbon · Jan 10, 2025 · Jan 14, 2025
diff --git a/config/nxf_cram.config b/config/nxf_cram.config
@@ -10,6 +10,13 @@ params {
     call_str = true
     call_sv = true
     call_cnv = true
+
+    coverage {
+      GRCh38 {
+        default_bed_exon = "${projectDir}/resources/GRCh38/default_exon.bed"
+        default_bed_gene = "${projectDir}/resources/GRCh38/default_gene.bed"
+      }
+    }
   }
 }
 

diff --git a/config/nxf_vcf.config b/config/nxf_vcf.config
@@ -111,7 +111,7 @@ params {
       include_crams = true
       max_records = ""
       max_samples = ""
-      template = "${projectDir}/resources/vip-report-template-v7.1.0.html"
+      template = "${projectDir}/resources/vip-report-template-v7.1.1.html"
       config = "${projectDir}/resources/vcf_report_config.json"
 			metadata = "${projectDir}/resources/field_metadata.json"
 

diff --git a/install.sh b/install.sh
@@ -99,6 +99,8 @@ download_files() {
   fi
   urls+=("55d49c8d95ffc9aee2ec584359c197d2" "resources/GRCh38/AlphScore_final_20230825_stripped_GRCh38.tsv.gz")
   urls+=("c6178d80393254789ebf9c43df6f2d6f" "resources/GRCh38/AlphScore_final_20230825_stripped_GRCh38.tsv.gz.tbi")
+  urls+=("515f004afad1c782de590730a9b057bd" "resources/GRCh38/default_exon.bed")
+  urls+=("78a5e20c2176a88ea9a689cb77c8b68d" "resources/GRCh38/default_gene.bed")
   urls+=("8e842bfe9c1eeb0943a588ff5662b9aa" "resources/GRCh38/GCA_000001405.15_GRCh38_no_alt_analysis_set.dict")
   urls+=("5fddbc109c82980f9436aa5c21a57c61" "resources/GRCh38/GCA_000001405.15_GRCh38_no_alt_analysis_set.fna.fai")
   urls+=("aab53048116f541b7aeef2da1c3e4ae7" "resources/GRCh38/GCA_000001405.15_GRCh38_no_alt_analysis_set.fna.gz")
@@ -147,7 +149,7 @@ download_files() {
   # update utils/install.sh when updating inheritance.tsv
   urls+=("519185b8b3b7688b9e99339d4045e3f0" "resources/inheritance_20241211.tsv")
   urls+=("7138e76a38d6f67935699d06082ecacf" "resources/vep/cache/homo_sapiens_refseq_vep_111_GRCh38.tar.gz")
-  urls+=("807a26ce4a3c44c59847081980506f35" "resources/vip-report-template-v7.1.0.html")
+  urls+=("4023abed0bccb31bf18bde3ddd1f2ed6" "resources/vip-report-template-v7.1.1.html")
   # when modifying urls array, please keep list in 'ls -l' order
   for ((i = 0; i < ${#urls[@]}; i += 2)); do
     download_file "${base_url}" "${urls[i+1]}" "${urls[i+0]}" "${output_dir}" "${validate}"

diff --git a/utils/create_default_bed.sh b/utils/create_default_bed.sh
@@ -0,0 +1,183 @@
+#!/bin/bash
+set -euo pipefail
+
+SCRIPT_NAME="$(basename "$0")"
+
+usage() {
+  echo -e "usage: ${SCRIPT_NAME} -v <arg>
+create default bed file
+  -i, --input    <arg>    input gff file (e.g. https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000/001/405/GCF_000001405.40_GRCh38.p14/GCF_000001405.40_GRCh38.p14_genomic.gff.gz)
+  -m, --mapping    <arg>    assembly report to map GFF contig identifiers to VCF contig identifiers(e.g https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000/001/405/GCF_000001405.40_GRCh38.p14/GCF_000001405.40_GRCh38.p14_assembly_report.txt)
+  -t, --types    <arg>    comma separated list of types from the gff that should be included
+  -s, --sources    <arg>    which sources to include, possible values: BestRefSeq,RefSeq,RefSeqFE,Gnomon,cmsearch,Curated Genomic,tRNAscan-SE
+  -o, --output    <arg>    output bed file file/location
+  -h, --help                Print this message and exit"
+}
+
+create() {
+	local -r input="${1}"
+	local -r mapping="${2}"
+	local -r types="${3}"
+	local -r sources="${4}"
+	local -r output="${5}"
+
+	zcat "${input}" | awk -v mapfile="${mapping}" -v types_input="${types}" -v sources_input="${sources}" '
+	BEGIN {
+		FS = OFS = "\t";
+		# Load mapping file
+		while ((getline line < mapfile) > 0) {
+			if (line ~ /^#/) continue;
+			split(line, fields, "\t");
+			contig_map[fields[7]] = gsub(/[\r\n]+/, "", fields[10]);;
+		}
+		close(mapfile);
+
+    #input types and sources to map for easier use
+		n = split(types_input, included_types, ",");
+		for (i = 1; i <= n; i++) {
+			types[included_types[i]] = 1;
+		}
+    n = split(sources_input, included_sources, ",");
+		for (i = 1; i <= n; i++) {
+			sources[included_sources[i]] = 1;
+		}
+	}
+	{
+    #check if any of the sources match any of the input sources
+    n = split($2, sources_split, "%2C");
+    include = 0;
+    for (i = 1; i <= n; i++) {
+      source = sources_split[i]
+      if(sources[source]){
+        include = 1;
+        break;
+      }
+    }
+    #check if the line has a type that should be included
+		if ($3 in types && include == 1) {
+			split($9, fields, ";");
+			id = "";
+			for (i in fields) {
+				split(fields[i], annotation_map, "=");
+				if (annotation_map[1] == "ID") {
+					id = annotation_map[2];
+					break;
+				}
+			}
+			if ($1 in contig_map) {
+				new_contig = contig_map[$1];
+				print new_contig, $4, $5, id;
+			} else {
+				print "Error: Unknown Contig " $1 " encountered." > "/dev/stderr";
+				exit 1;
+			}
+		}
+	}' > "${output}"
+}
+
+# Validate the command-line values before any work is done.
+# Arguments:
+#   $1 - input:   must be non-empty, an existing file, and end in .gff.gz
+#   $2 - mapping: must be non-empty, end in .txt, and be an existing file
+#   $3 - types:   must be non-empty
+#   $4 - sources: must be non-empty
+#   $5 - output:  must end in .bed and must not exist yet
+# Outputs: prints a message describing the first failed check.
+# Returns: exits the script with status 1 on the first failed check.
+validate() {
+  local -r input="${1}"
+  # The second argument is the mapping file; bind it to the name the checks
+  # below use instead of relying on a variable leaked from the caller.
+  local -r mapping="${2}"
+  local -r types="${3}"
+  local -r sources="${4}"
+  local -r output="${5}"
+
+  # input
+  if [[ -z "${input}" ]]; then
+    echo -e "missing required -i, --input"
+    exit 1
+  fi
+  if [[ ! -f "${input}" ]]; then
+    echo -e "-i, --input '${input}' does not exist"
+    exit 1
+  fi
+  if [[ "${input}" != *.gff.gz ]]; then
+    echo -e "-i, --input '${input}' is not a '.gff.gz' file"
+    exit 1
+  fi
+
+  # output
+  if [[ "${output}" != *.bed ]]; then
+    echo -e "-o, --output '${output}' is not a '.bed' file"
+    exit 1
+  fi
+  if [[ -f "${output}" ]]; then
+    echo -e "-o, --output '${output}' already exists"
+    exit 1
+  fi
+
+  # mapping
+  if [[ -z "${mapping}" ]]; then
+    echo -e "missing required -m, --mapping"
+    exit 1
+  fi
+  if [[ "${mapping}" != *.txt ]]; then
+    echo -e "-m, --mapping '${mapping}' is not a '.txt' file"
+    exit 1
+  fi
+  # Without this check a wrong path is only noticed later, when every contig
+  # is reported as unknown because the mapping table stayed empty.
+  if [[ ! -f "${mapping}" ]]; then
+    echo -e "-m, --mapping '${mapping}' does not exist"
+    exit 1
+  fi
+
+  # sources
+  if [[ -z "${sources}" ]]; then
+    echo -e "missing required -s, --sources"
+    exit 1
+  fi
+  # types
+  if [[ -z "${types}" ]]; then
+    echo -e "missing required -t, --types"
+    exit 1
+  fi
+}
+
+main() {
+  local -r args=$(getopt -a -n pipeline -o i:o:m:t:s:h --long input:,mapping:,types:,sources:,output:,help -- "$@")
+  # shellcheck disable=SC2181
+  if [[ $? != 0 ]]; then
+    usage
+    exit 2
+  fi
+
+  local version=""
+
+  eval set -- "${args}"
+  while :; do
+    case "$1" in
+    -h | --help)
+      usage
+      exit 0
+      ;;
+    -i | --input)
+      input="$2"
+      shift 2
+      ;;
+	-m | --mapping)
+      mapping="$2"
+      shift 2
+      ;;
+	-t | --types)
+      types="$2"
+      shift 2
+      ;;
+  -s | --sources)
+      sources="$2"
+      shift 2
+      ;;
+	-o | --output)
+      output="$2"
+      shift 2
+      ;;
+    --)
+      shift
+      break
+      ;;
+    *)
+      usage
+      exit 2
+      ;;
+    esac
+  done
+
+  validate "${input}" "${mapping}" "${types}" "${sources}" "${output}"
+  create "${input}" "${mapping}" "${types}" "${sources}" "${output}"
+}
+
+main "${@}"
diff --git a/vip_cram.nf b/vip_cram.nf
@@ -32,8 +32,7 @@ workflow cram {
 
 		// coverage
 		ch_cram_multi.coverage
-      | filter { meta -> meta.project.regions != null }
-		  | map { meta -> [meta, meta.sample.cram.data, meta.sample.cram.index, meta.project.regions] }
+		  | map { meta -> [meta, meta.sample.cram.data, meta.sample.cram.index, meta.project.regions ? meta.project.regions : meta.project.sequencing_method == "WES" ? params.cram.coverage[meta.project.assembly].default_bed_exon : params.cram.coverage[meta.project.assembly].default_bed_gene ] }
       | coverage
 
     // snv