From 63248edcf8c07c0433e2375000d9722526a8b9ae Mon Sep 17 00:00:00 2001
From: Thomas Mooney <tmooney@genome.wustl.edu>
Date: Tue, 13 Dec 2022 16:19:29 -0600
Subject: [PATCH] Use sambamba markdup for duplicate marking.

These pipelines never used the barcode tag feature, so it is dropped here; if
it is needed later, we can bring back a Picard-based version.
---
 definitions/pipelines/rnaseq.cwl              |  2 --
 definitions/pipelines/rnaseq_star_fusion.cwl  |  2 --
 .../rnaseq_star_fusion_with_xenosplit.cwl     |  2 --
 definitions/tools/generate_fda_tables.cwl     | 24 +++++++++++++++----
 .../tools/mark_duplicates_and_sort.cwl        | 21 ++++++----------
 5 files changed, 27 insertions(+), 24 deletions(-)

diff --git a/definitions/pipelines/rnaseq.cwl b/definitions/pipelines/rnaseq.cwl
index c42f23a94..5961d59c2 100644
--- a/definitions/pipelines/rnaseq.cwl
+++ b/definitions/pipelines/rnaseq.cwl
@@ -150,8 +150,6 @@ steps:
         run: ../tools/mark_duplicates_and_sort.cwl
         in:
             bam: index_bam/indexed_bam
-            input_sort_order: 
-                default: "coordinate"
         out:
             [sorted_bam, metrics_file]
     stringtie:
diff --git a/definitions/pipelines/rnaseq_star_fusion.cwl b/definitions/pipelines/rnaseq_star_fusion.cwl
index a9170cb5c..789e68065 100644
--- a/definitions/pipelines/rnaseq_star_fusion.cwl
+++ b/definitions/pipelines/rnaseq_star_fusion.cwl
@@ -238,8 +238,6 @@ steps:
         run: ../tools/mark_duplicates_and_sort.cwl
         in:
             bam: sort_bam/sorted_bam
-            input_sort_order:
-                default: "coordinate"
         out:
             [sorted_bam, metrics_file]
     index_bam:
diff --git a/definitions/pipelines/rnaseq_star_fusion_with_xenosplit.cwl b/definitions/pipelines/rnaseq_star_fusion_with_xenosplit.cwl
index 1537d435b..9d47f552c 100644
--- a/definitions/pipelines/rnaseq_star_fusion_with_xenosplit.cwl
+++ b/definitions/pipelines/rnaseq_star_fusion_with_xenosplit.cwl
@@ -257,8 +257,6 @@ steps:
         run: ../tools/mark_duplicates_and_sort.cwl
         in:
             bam: sort_bam/sorted_bam
-            input_sort_order:
-                default: "coordinate"
         out:
             [sorted_bam, metrics_file]
     index_bam:
diff --git a/definitions/tools/generate_fda_tables.cwl b/definitions/tools/generate_fda_tables.cwl
index 93ca2421b..d81dc714d 100644
--- a/definitions/tools/generate_fda_tables.cwl
+++ b/definitions/tools/generate_fda_tables.cwl
@@ -4,7 +4,7 @@ class: CommandLineTool
 label: "Script to create FDA-requested summary tables"
 requirements:
     - class: DockerRequirement
-      dockerPull: "python:3.7.4-slim-buster"
+      dockerPull: "python:3.10.8-slim-buster"
     - class: ResourceRequirement
       ramMin: 8000
     - class: InitialWorkDirRequirement
@@ -232,9 +232,25 @@ requirements:
 
             def parse_duplication_metrics(duplication_metrics):
                 with open(duplication_metrics) as f:
-                    raw_chunk = f.read().split('\n\n')[1]
-                pct_dup = raw_chunk.splitlines()[2].split('\t')[8]
-                return {'PERCENT_DUPLICATION': pct_dup}
+                    lines = f.read().splitlines()
+                # One regex per count expected in the metrics text; each key completes the error message.
+                patterns = {
+                    'end pairs': r'sorted (\d+) end pairs',
+                    'single ends': r'and (\d+) single ends',
+                    'duplicates': r'found (\d+) duplicates',
+                }
+                counts = {}
+                for line in lines:
+                    for name, pattern in patterns.items():
+                        if match := re.search(pattern, line):
+                            counts[name] = float(match.group(1))
+                            break
+                for name in patterns:
+                    if name not in counts:
+                        raise ValueError(f'Failed to parse number of {name}')
+                # Reads examined: two per end pair plus the single ends. Report 0.0 rather than divide by zero.
+                total_reads = 2.0 * counts['end pairs'] + counts['single ends']
+                return {'PERCENT_DUPLICATION': str(counts['duplicates'] / total_reads * 100.0 if total_reads else 0.0)}
 
             def parse_insert_size_metrics(insert_size_metrics):
                 with open(insert_size_metrics) as f:
diff --git a/definitions/tools/mark_duplicates_and_sort.cwl b/definitions/tools/mark_duplicates_and_sort.cwl
index 79097442c..13a0f85d6 100644
--- a/definitions/tools/mark_duplicates_and_sort.cwl
+++ b/definitions/tools/mark_duplicates_and_sort.cwl
@@ -7,10 +7,10 @@ label: "Mark duplicates and Sort"
 baseCommand: ["/bin/bash", "markduplicates_helper.sh"]
 requirements:
     - class: ResourceRequirement
-      coresMin: 8
+      coresMin: 16
       ramMin: 40000
     - class: DockerRequirement
-      dockerPull: "mgibio/mark_duplicates-cwl:1.0.1"
+      dockerPull: "quay.io/biocontainers/sambamba:0.8.2--h98b6b92_2"
     - class: InitialWorkDirRequirement
       listing:
       - entryname: 'markduplicates_helper.sh'
@@ -18,13 +18,11 @@ requirements:
             set -o pipefail
             set -o errexit
 
-            declare MD_BARCODE_TAG
-            if [ ! -z "$6" ]; then
-              MD_BARCODE_TAG="BARCODE_TAG=$6"
-            /usr/bin/java -Xmx16g -jar /opt/picard/picard.jar MarkDuplicates I=$1 O=/dev/stdout ASSUME_SORT_ORDER=$5 METRICS_FILE=$4 QUIET=true COMPRESSION_LEVEL=0 VALIDATION_STRINGENCY=LENIENT "$MD_BARCODE_TAG" | /usr/bin/sambamba sort -t $2 -m 18G -o $3 /dev/stdin
-            else
-              /usr/bin/java -Xmx16g -jar /opt/picard/picard.jar MarkDuplicates I=$1 O=/dev/stdout ASSUME_SORT_ORDER=$5 METRICS_FILE=$4 QUIET=true COMPRESSION_LEVEL=0 VALIDATION_STRINGENCY=LENIENT | /usr/bin/sambamba sort -t $2 -m 18G -o $3 /dev/stdin
-            fi
+            # markdup and sort run concurrently in the pipe below, so each gets half of the cores (argument 2), minimum 1.
+            CORES_PER_JOB=`perl -E 'my $x = int($ARGV[0]/2); say($x < 1? 1 : $x)' "$2"`
+            # The stderr of markdup is saved as the metrics file (argument 4).
+            sambamba markdup -l 0 -t "$CORES_PER_JOB" "$1" /dev/stdout 2> "$4" \
+              | sambamba sort -t "$CORES_PER_JOB" -m 16G -o "$3" /dev/stdin
 arguments:
     - position: 2
       valueFrom: "$(runtime.cores)"
@@ -35,11 +33,6 @@ inputs:
         type: File
         inputBinding:
             position: 1
-    input_sort_order:
-        type: string
-        default: "queryname"
-        inputBinding:
-            position: 5
     output_name:
         type: string?
         default: 'MarkedSorted.bam'