From 63248edcf8c07c0433e2375000d9722526a8b9ae Mon Sep 17 00:00:00 2001
From: Thomas Mooney
Date: Tue, 13 Dec 2022 16:19:29 -0600
Subject: [PATCH] Use sambamba markdup for duplicate marking.

We weren't using the barcode tag feature in these pipelines, but if we
were we could bring back a picard version.
---
Review notes (this section is ignored when the patch is applied):

- markduplicates_helper.sh: the closing backtick of the perl command
  substitution now comes after "$CORES", so perl receives the core count as
  $ARGV[0] and CORES_PER_JOB becomes max(1, cores/2). With the backtick placed
  before $CORES, perl saw no argument and the shell tried to execute the core
  count itself as a command, which aborts the script under `set -o errexit`.
- Backticks are kept on purpose: "$(" inside a CWL InitialWorkDirRequirement
  entry is read as a CWL parameter reference, so `$(...)` would need escaping.
- parse_duplication_metrics now reads the pair/single/duplicate counts from
  the log that `sambamba markdup` writes to stderr (redirected to "$4") and
  raises ValueError when any of the three counts is missing.
- NOTE(review): leading indentation of the lines below was reconstructed from
  a whitespace-collapsed copy of this patch; confirm it against the target
  tree before applying.

 definitions/pipelines/rnaseq.cwl              |  2 --
 definitions/pipelines/rnaseq_star_fusion.cwl  |  2 --
 .../rnaseq_star_fusion_with_xenosplit.cwl     |  2 --
 definitions/tools/generate_fda_tables.cwl     | 24 +++++++++++++++----
 .../tools/mark_duplicates_and_sort.cwl        | 21 ++++++----------
 5 files changed, 27 insertions(+), 24 deletions(-)

diff --git a/definitions/pipelines/rnaseq.cwl b/definitions/pipelines/rnaseq.cwl
index c42f23a94..5961d59c2 100644
--- a/definitions/pipelines/rnaseq.cwl
+++ b/definitions/pipelines/rnaseq.cwl
@@ -150,8 +150,6 @@ steps:
         run: ../tools/mark_duplicates_and_sort.cwl
         in:
             bam: index_bam/indexed_bam
-            input_sort_order:
-                default: "coordinate"
         out:
             [sorted_bam, metrics_file]
     stringtie:
diff --git a/definitions/pipelines/rnaseq_star_fusion.cwl b/definitions/pipelines/rnaseq_star_fusion.cwl
index a9170cb5c..789e68065 100644
--- a/definitions/pipelines/rnaseq_star_fusion.cwl
+++ b/definitions/pipelines/rnaseq_star_fusion.cwl
@@ -238,8 +238,6 @@ steps:
         run: ../tools/mark_duplicates_and_sort.cwl
         in:
             bam: sort_bam/sorted_bam
-            input_sort_order:
-                default: "coordinate"
         out:
             [sorted_bam, metrics_file]
     index_bam:
diff --git a/definitions/pipelines/rnaseq_star_fusion_with_xenosplit.cwl b/definitions/pipelines/rnaseq_star_fusion_with_xenosplit.cwl
index 1537d435b..9d47f552c 100644
--- a/definitions/pipelines/rnaseq_star_fusion_with_xenosplit.cwl
+++ b/definitions/pipelines/rnaseq_star_fusion_with_xenosplit.cwl
@@ -257,8 +257,6 @@ steps:
         run: ../tools/mark_duplicates_and_sort.cwl
         in:
             bam: sort_bam/sorted_bam
-            input_sort_order:
-                default: "coordinate"
         out:
             [sorted_bam, metrics_file]
     index_bam:
diff --git a/definitions/tools/generate_fda_tables.cwl b/definitions/tools/generate_fda_tables.cwl
index 93ca2421b..d81dc714d 100644
--- a/definitions/tools/generate_fda_tables.cwl
+++ b/definitions/tools/generate_fda_tables.cwl
@@ -4,7 +4,7 @@ class: CommandLineTool
 label: "Script to create FDA-requested summary tables"
 requirements:
     - class: DockerRequirement
-      dockerPull: "python:3.7.4-slim-buster"
+      dockerPull: "python:3.10.8-slim-buster"
     - class: ResourceRequirement
       ramMin: 8000
     - class: InitialWorkDirRequirement
@@ -232,9 +232,25 @@ requirements:
 
             def parse_duplication_metrics(duplication_metrics):
                 with open(duplication_metrics) as f:
-                    raw_chunk = f.read().split('\n\n')[1]
-                    pct_dup = raw_chunk.splitlines()[2].split('\t')[8]
-                    return {'PERCENT_DUPLICATION': pct_dup}
+                    pairs = None
+                    singles = None
+                    duplicates = None
+                    lines = f.read().splitlines()
+                    for line in lines:
+                        if match_pairs := re.search(r'sorted (\d+) end pairs', line):
+                            pairs = match_pairs.group(1)
+                        elif match_singles := re.search(r'and (\d+) single ends', line):
+                            singles = match_singles.group(1)
+                        elif match_duplicates := re.search(r'found (\d+) duplicates', line):
+                            duplicates = match_duplicates.group(1)
+                    if pairs is None:
+                        raise ValueError('Failed to parse number of end pairs')
+                    if singles is None:
+                        raise ValueError('Failed to parse number of single ends')
+                    if duplicates is None:
+                        raise ValueError('Failed to parse number of duplicates')
+
+                    return {'PERCENT_DUPLICATION': str(float(duplicates)/(2.0*float(pairs) + float(singles))*100.0)}
 
             def parse_insert_size_metrics(insert_size_metrics):
                 with open(insert_size_metrics) as f:
diff --git a/definitions/tools/mark_duplicates_and_sort.cwl b/definitions/tools/mark_duplicates_and_sort.cwl
index 79097442c..13a0f85d6 100644
--- a/definitions/tools/mark_duplicates_and_sort.cwl
+++ b/definitions/tools/mark_duplicates_and_sort.cwl
@@ -7,10 +7,10 @@ label: "Mark duplicates and Sort"
 baseCommand: ["/bin/bash", "markduplicates_helper.sh"]
 requirements:
     - class: ResourceRequirement
-      coresMin: 8
+      coresMin: 16
       ramMin: 40000
     - class: DockerRequirement
-      dockerPull: "mgibio/mark_duplicates-cwl:1.0.1"
+      dockerPull: "quay.io/biocontainers/sambamba:0.8.2--h98b6b92_2"
     - class: InitialWorkDirRequirement
       listing:
       - entryname: 'markduplicates_helper.sh'
@@ -18,13 +18,11 @@ requirements:
             set -o pipefail
             set -o errexit
 
-            declare MD_BARCODE_TAG
-            if [ ! -z "$6" ]; then
-              MD_BARCODE_TAG="BARCODE_TAG=$6"
-              /usr/bin/java -Xmx16g -jar /opt/picard/picard.jar MarkDuplicates I=$1 O=/dev/stdout ASSUME_SORT_ORDER=$5 METRICS_FILE=$4 QUIET=true COMPRESSION_LEVEL=0 VALIDATION_STRINGENCY=LENIENT "$MD_BARCODE_TAG" | /usr/bin/sambamba sort -t $2 -m 18G -o $3 /dev/stdin
-            else
-              /usr/bin/java -Xmx16g -jar /opt/picard/picard.jar MarkDuplicates I=$1 O=/dev/stdout ASSUME_SORT_ORDER=$5 METRICS_FILE=$4 QUIET=true COMPRESSION_LEVEL=0 VALIDATION_STRINGENCY=LENIENT | /usr/bin/sambamba sort -t $2 -m 18G -o $3 /dev/stdin
-            fi
+            CORES="$2"
+            CORES_PER_JOB=`perl -E 'my $x = int($ARGV[0]/2); say($x < 1? 1 : $x)' "$CORES"`
+
+            sambamba markdup -l 0 -t $CORES_PER_JOB "$1" /dev/stdout 2> "$4" \
+              | sambamba sort -t $CORES_PER_JOB -m 16G -o "$3" /dev/stdin
 arguments:
     - position: 2
       valueFrom: "$(runtime.cores)"
@@ -35,11 +33,6 @@ inputs:
         type: File
         inputBinding:
             position: 1
-    input_sort_order:
-        type: string
-        default: "queryname"
-        inputBinding:
-            position: 5
     output_name:
         type: string?
         default: 'MarkedSorted.bam'