diff --git a/src/main/java/picard/analysis/HistogramGenerator.java b/src/main/java/picard/analysis/HistogramGenerator.java new file mode 100644 index 0000000000..4d5479db19 --- /dev/null +++ b/src/main/java/picard/analysis/HistogramGenerator.java @@ -0,0 +1,231 @@ +/* + * The MIT License + * + * Copyright (c) 2022 The Broad Institute + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ + +package picard.analysis; + +import htsjdk.samtools.SAMRecord; +import htsjdk.samtools.util.Histogram; +import htsjdk.samtools.util.QualityUtil; + +import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; + +final class HistogramGenerator { + final boolean useOriginalQualities; + int maxLengthSoFar = 0; + double[] firstReadTotalsByCycle = new double[maxLengthSoFar]; + double[] firstReadTotalProbsByCycle = new double[maxLengthSoFar]; + long[] firstReadCountsByCycle = new long[maxLengthSoFar]; + double[] secondReadTotalsByCycle = new double[maxLengthSoFar]; + double[] secondReadTotalProbsByCycle = new double[maxLengthSoFar]; + long[] secondReadCountsByCycle = new long[maxLengthSoFar]; + int recordsCount = 0; + final public int skipBases = 10; //qualities of the first bases are ignored (inference issues) + final public int minimalCount = 25; + + public HistogramGenerator(final boolean useOriginalQualities) { + this.useOriginalQualities = useOriginalQualities; + } + + public HistogramGenerator(final double[] firstReadTotalsByCycle, final double[] firstReadTotalProbsByCycle, + final long[] firstReadCountsByCycle, final double[] secondReadTotalsByCycle, + final double[] secondReadTotalProbsByCycle, final long[] secondReadCountsByCycle, int nRecords){ + this.firstReadCountsByCycle = firstReadCountsByCycle.clone(); + this.firstReadTotalsByCycle = firstReadTotalsByCycle.clone(); + this.firstReadTotalProbsByCycle = firstReadTotalProbsByCycle.clone(); + this.secondReadCountsByCycle = secondReadCountsByCycle.clone(); + this.secondReadTotalsByCycle = secondReadTotalsByCycle.clone(); + this.secondReadTotalProbsByCycle = secondReadTotalProbsByCycle.clone(); + this.useOriginalQualities = false; + this.recordsCount = nRecords; + } + + public void addRecord(final SAMRecord rec) { + final byte[] quals = (useOriginalQualities ? rec.getOriginalBaseQualities() : rec.getBaseQualities()); + if (quals == null) return; + recordsCount++; + final int length = quals.length; + final boolean rc = rec.getReadNegativeStrandFlag(); + ensureArraysBigEnough(length+1); + + for (int i=0; i result = new ArrayList<>(); + List weights = new ArrayList<>(); + double[] accumulator; + long[] counts; + if (readInPair == 1){ + accumulator = firstReadTotalProbsByCycle; + counts = firstReadCountsByCycle; + } else { + accumulator = secondReadTotalProbsByCycle; + counts = secondReadCountsByCycle; + } + for (int i = skipBases; i < accumulator.length; i++ ){ + if (counts[i] < minimalCount){ + break; + } + result.add(accumulator[i]/counts[i]); + weights.add(counts[i]); + } + applySpanningWindowMean(result, weights, spanningWindowLength); + return longestHighQuality(result,errorProbThreshold); + } + + private void applySpanningWindowMean(List vector, List weights, final int spanLength){ + List tmp = new ArrayList<>(vector); + for (int i = 0; i < vector.size(); i++){ + double tmpEr = 0; + long tmpWeight = 0; + for (int j = Math.max(i-spanLength,0); j < Math.min(i+spanLength+1, vector.size()); j++){ + tmpEr += tmp.get(j)*weights.get(j); + tmpWeight += weights.get(j); + } + vector.set(i, tmpEr/tmpWeight); + } + } + + private int longestHighQuality(List averageErrorProbabilities, double errorProbThreshold){ + int curStart = 0; + int curEnd = 0; + int curBestIntervalLength = 0; + + while ( curEnd < averageErrorProbabilities.size() ) { + if (averageErrorProbabilities.get(curEnd) <= errorProbThreshold) { + curEnd++; + } else { + if ((curEnd - curStart) > curBestIntervalLength) { + curBestIntervalLength = curEnd - curStart; + } + curStart = curEnd + 1; + curEnd = curStart; + } + } + if ((curEnd - curStart) > curBestIntervalLength) { + curBestIntervalLength = curEnd - curStart; + } + return curBestIntervalLength; + } + + boolean isEmpty() { + return maxLengthSoFar == 0; + } + + + private void ensureArraysBigEnough(final int length) { + if (length > maxLengthSoFar) { + firstReadTotalsByCycle = Arrays.copyOf(firstReadTotalsByCycle, length); + firstReadTotalProbsByCycle = Arrays.copyOf(firstReadTotalProbsByCycle, length); + firstReadCountsByCycle = Arrays.copyOf(firstReadCountsByCycle, length); + secondReadTotalsByCycle = Arrays.copyOf(secondReadTotalsByCycle , length); + secondReadTotalProbsByCycle = Arrays.copyOf(secondReadTotalProbsByCycle , length); + secondReadCountsByCycle = Arrays.copyOf(secondReadCountsByCycle, length); + maxLengthSoFar = length; + } + } + + Histogram getMeanQualityHistogram() { + final String label = useOriginalQualities ? "MEAN_ORIGINAL_QUALITY" : "MEAN_QUALITY"; + final Histogram meanQualities = new Histogram("CYCLE", label); + + int firstReadLength = 0; + + for (int cycle=0; cycle < firstReadTotalsByCycle.length; ++cycle) { + if (firstReadTotalsByCycle[cycle] > 0) { + meanQualities.increment(cycle, firstReadTotalsByCycle[cycle] / firstReadCountsByCycle[cycle]); + firstReadLength = cycle; + } + } + + for (int i=0; i< secondReadTotalsByCycle.length; ++i) { + if (secondReadCountsByCycle[i] > 0) { + final int cycle = firstReadLength + i; + meanQualities.increment(cycle, secondReadTotalsByCycle[i] / secondReadCountsByCycle[i]); + } + } + + return meanQualities; + } + + Histogram getMeanErrorProbHistogram() { + final String label = useOriginalQualities ? "MEAN_ORIGINAL_ERROR_PROB" : "MEAN_ERROR_PROB"; + final Histogram meanQualities = new Histogram("CYCLE", label); + + int firstReadLength = 0; + + for (int cycle=0; cycle < firstReadTotalsByCycle.length; ++cycle) { + if (firstReadTotalsByCycle[cycle] > 0) { + meanQualities.increment(cycle, firstReadTotalProbsByCycle[cycle] / firstReadCountsByCycle[cycle]); + firstReadLength = cycle; + } + } + + for (int i=0; i< secondReadTotalsByCycle.length; ++i) { + if (secondReadCountsByCycle[i] > 0) { + final int cycle = firstReadLength + i; + meanQualities.increment(cycle, secondReadTotalProbsByCycle[i] / secondReadCountsByCycle[i]); + } + } + + return meanQualities; + } + +} diff --git a/src/main/java/picard/sam/DuplicationMetricsFactory.java b/src/main/java/picard/sam/DuplicationMetricsFactory.java new file mode 100644 index 0000000000..ad7299fb72 --- /dev/null +++ b/src/main/java/picard/sam/DuplicationMetricsFactory.java @@ -0,0 +1,19 @@ +package picard.sam; + +public class DuplicationMetricsFactory { + + // create a DuplicationMetrics for a specific read group + public static DuplicationMetrics createMetrics(final boolean flowMetrics) { + + // create based on the presence of flow order + if ( !flowMetrics ) { + return new DuplicationMetrics(); + } else { + return new FlowBasedDuplicationMetrics(); + } + } + + public static DuplicationMetrics createMetrics() { + return new DuplicationMetrics(); + } +} diff --git a/src/main/java/picard/sam/FlowBasedDuplicationMetrics.java b/src/main/java/picard/sam/FlowBasedDuplicationMetrics.java new file mode 100644 index 0000000000..ed25731c07 --- /dev/null +++ b/src/main/java/picard/sam/FlowBasedDuplicationMetrics.java @@ -0,0 +1,74 @@ +package picard.sam; + +import htsjdk.samtools.SAMRecord; +import picard.sam.markduplicates.util.AbstractMarkDuplicatesCommandLineProgram; +import picard.util.MathUtil; + +public class FlowBasedDuplicationMetrics extends DuplicationMetrics { + + /* + * count of single end reads where the exact fragment length is known (i.e. clipped) + */ + @MergeByAdding + public long UNPAIRED_WITH_TLEN; + + /* + * count of single end duplicates where the exact fragment length is + * unknown (quality trimmed, not clipped) + */ + @MergeByAdding + public long UNPAIRED_DUPS_WITHOUT_TLEN; + + /* + * count of duplicates where both ends are known + */ + @MergeByAdding + public long UNPAIRED_DUPS_WITH_TLEN; + + /** + * The fraction of duplicated reads out of all reads with exact + * fragment length unknown + */ + @NoMergingIsDerived + public Double UNPAIRED_DUP_RATE_WITHOUT_TLEN; + + /** + * The fraction of duplicated reads out of all reads with exact fragment + * length known + */ + @NoMergingIsDerived + public Double UNPAIRED_DUP_RATE_WITH_TLEN; + + + @Override + public void calculateDerivedFields() { + super.calculateDerivedFields(); + + UNPAIRED_DUP_RATE_WITHOUT_TLEN = MathUtil.divide(UNPAIRED_DUPS_WITHOUT_TLEN, UNPAIRED_READS_EXAMINED - UNPAIRED_WITH_TLEN); + UNPAIRED_DUP_RATE_WITH_TLEN = MathUtil.divide(UNPAIRED_DUPS_WITH_TLEN, UNPAIRED_WITH_TLEN); + } + + public void addDuplicateReadToMetrics(final SAMRecord rec) { + super.addDuplicateReadToMetrics(rec); + + if (!rec.isSecondaryOrSupplementary() && !rec.getReadUnmappedFlag()) { + if (!rec.getReadPairedFlag() || rec.getMateUnmappedFlag()) { + if ( AbstractMarkDuplicatesCommandLineProgram.isSingleEndReadKnownFragment(rec) ) { + ++UNPAIRED_DUPS_WITH_TLEN; + } else { + ++UNPAIRED_DUPS_WITHOUT_TLEN; + } + } + } + } + + public void addReadToLibraryMetrics(final SAMRecord rec) { + + super.addReadToLibraryMetrics(rec); + + if (AbstractMarkDuplicatesCommandLineProgram.isSingleEndReadKnownFragment(rec)) { + ++UNPAIRED_WITH_TLEN; + } + } + +} diff --git a/src/main/java/picard/sam/markduplicates/MarkDuplicatesForFlowArgumentCollection.java b/src/main/java/picard/sam/markduplicates/MarkDuplicatesForFlowArgumentCollection.java new file mode 100644 index 0000000000..2e51a46e6f --- /dev/null +++ b/src/main/java/picard/sam/markduplicates/MarkDuplicatesForFlowArgumentCollection.java @@ -0,0 +1,41 @@ +package picard.sam.markduplicates; + +import org.broadinstitute.barclay.argparser.Argument; + +public class MarkDuplicatesForFlowArgumentCollection { + + @Argument(doc = "enable parameters and behavior specific to flow based reads.", optional = true) + public boolean FLOW_MODE = false; + + @Argument(doc = "Use specific quality summing strategy for flow based reads. The strategy ensures that the same " + + "(and correct) quality value is used for all bases of the same homopolymer.", optional = true) + public boolean FLOW_QUALITY_SUM_STRATEGY = false; + + @Argument(doc = "Make the end location of single end read be significant when considering duplicates, " + + "in addition to the start location, which is always significant (i.e. require single-ended reads to start and" + + "end on the same position to be considered duplicate) " + + "(for this argument, \"read end\" means 3' end).", optional = true) + public boolean USE_END_IN_UNPAIRED_READS = false; + + @Argument(doc = "Use position of the clipping as the end position, when considering duplicates (or use the unclipped end position) " + + "(for this argument, \"read end\" means 3' end).", optional = true) + public boolean USE_UNPAIRED_CLIPPED_END = false; + + @Argument(doc = "Maximal difference of the read end position that counted as equal. Useful for flow based " + + "reads where the end position might vary due to sequencing errors. " + + "(for this argument, \"read end\" means 3' end)", optional = true) + public int UNPAIRED_END_UNCERTAINTY = 0; + + @Argument(doc = "Skip first N flows, starting from the read's start, when considering duplicates. Useful for flow based reads where sometimes there " + + "is noise in the first flows " + + "(for this argument, \"read start\" means 5' end).", optional = true) + public int FLOW_SKIP_FIRST_N_FLOWS = 0; + + @Argument(doc = "Treat position of read trimming based on quality as the known end (relevant for flow based reads). Default false - if the read " + + "is trimmed on quality its end is not defined and the read is duplicate of any read starting at the same place.", optional = true) + public boolean FLOW_Q_IS_KNOWN_END = false; + + @Argument(doc = "Threshold for considering a quality value high enough to be included when calculating FLOW_QUALITY_SUM_STRATEGY calculation.", optional = true) + public int FLOW_EFFECTIVE_QUALITY_THRESHOLD = 15; + +} diff --git a/src/main/java/picard/sam/markduplicates/MarkDuplicatesForFlowHelper.java b/src/main/java/picard/sam/markduplicates/MarkDuplicatesForFlowHelper.java new file mode 100644 index 0000000000..0a26767a76 --- /dev/null +++ b/src/main/java/picard/sam/markduplicates/MarkDuplicatesForFlowHelper.java @@ -0,0 +1,422 @@ +package picard.sam.markduplicates; + +import com.google.common.annotations.VisibleForTesting; +import htsjdk.samtools.SAMFileHeader; +import htsjdk.samtools.SAMReadGroupRecord; +import htsjdk.samtools.SAMRecord; +import htsjdk.samtools.util.Log; +import htsjdk.samtools.util.SortingCollection; +import htsjdk.samtools.util.SortingLongCollection; +import picard.sam.markduplicates.util.ReadEndsForMarkDuplicates; +import picard.sam.markduplicates.util.RepresentativeReadIndexerCodec; +import picard.sam.util.RepresentativeReadIndexer; + +import java.io.File; +import java.util.ArrayList; +import java.util.Comparator; +import java.util.List; + +/** + * MarkDuplicates calculation helper class for flow based mode + * + * The class extends the behavior of MarkDuplicates which contains the complete + * code for the non-flow based mode. When in flow mode, additional parameters + * may control the establishment of read ends (start/end) for the purpose of + * determining duplication status. Additionally, the logic used to gather reads into + * (duplicate) buckets (chunks) is enhanced with an optional mechanism of read end + * uncertainty threshold. When active, reads are considered to belong to the same chunk if + * for each read in the chunk there exists at least one other read with the uncertainty + * distance on the read end. + */ +public class MarkDuplicatesForFlowHelper implements MarkDuplicatesHelper { + + private final Log log = Log.getInstance(MarkDuplicatesForFlowHelper.class); + + private static final int END_INSIGNIFICANT_VALUE = 0; + private static final String ATTR_DUPLICATE_SCORE = "ForFlowDuplicateScore"; + + // constants for clippingTagContainsAny + public static final String CLIPPING_TAG_NAME = "tm"; + public static final char[] CLIPPING_TAG_CONTAINS_A = {'A'}; + public static final char[] CLIPPING_TAG_CONTAINS_AQ = {'A', 'Q'}; + public static final char[] CLIPPING_TAG_CONTAINS_QZ = {'Q', 'Z'}; + + // instance of hosting MarkDuplicates + private final MarkDuplicates md; + + public MarkDuplicatesForFlowHelper(final MarkDuplicates md) { + this.md = md; + + validateFlowParameteres(); + } + + private void validateFlowParameteres() { + + if ( md.flowBasedArguments.UNPAIRED_END_UNCERTAINTY != 0 && !md.flowBasedArguments.USE_END_IN_UNPAIRED_READS ) { + throw new IllegalArgumentException("invalid parameter combination. UNPAIRED_END_UNCERTAINTY can not be specified when USE_END_IN_UNPAIRED_READS not specified"); + } + } + + /** + * This method is identical in function to generateDuplicateIndexes except that it accomodates for + * the possible significance of the end side of the reads (w/ or w/o uncertainty). This is only + * applicable for flow mode invocation. + */ + public void generateDuplicateIndexes(final boolean useBarcodes, final boolean indexOpticalDuplicates) { + final int entryOverhead; + if (md.TAG_DUPLICATE_SET_MEMBERS) { + // Memory requirements for RepresentativeReadIndexer: + // three int entries + overhead: (3 * 4) + 4 = 16 bytes + entryOverhead = 16; + } else { + entryOverhead = SortingLongCollection.SIZEOF; + } + // Keep this number from getting too large even if there is a huge heap. + int maxInMemory = (int) Math.min((Runtime.getRuntime().maxMemory() * 0.25) / entryOverhead, (double) (Integer.MAX_VALUE - 5)); + // If we're also tracking optical duplicates, reduce maxInMemory, since we'll need two sorting collections + if (indexOpticalDuplicates) { + maxInMemory /= ((entryOverhead + SortingLongCollection.SIZEOF) / entryOverhead); + md.opticalDuplicateIndexes = new SortingLongCollection(maxInMemory, md.TMP_DIR.toArray(new File[md.TMP_DIR.size()])); + } + log.info("Will retain up to " + maxInMemory + " duplicate indices before spilling to disk."); + md.duplicateIndexes = new SortingLongCollection(maxInMemory, md.TMP_DIR.toArray(new File[md.TMP_DIR.size()])); + if (md.TAG_DUPLICATE_SET_MEMBERS) { + final RepresentativeReadIndexerCodec representativeIndexCodec = new RepresentativeReadIndexerCodec(); + md.representativeReadIndicesForDuplicates = SortingCollection.newInstance(RepresentativeReadIndexer.class, + representativeIndexCodec, + Comparator.comparing(read -> read.readIndexInFile), + maxInMemory, + md.TMP_DIR); + } + + // this code does support pairs at this time + if ( md.pairSort.iterator().hasNext() ) { + throw new IllegalArgumentException("flow based code does not support paired reads"); + } + md.pairSort.cleanup(); + md.pairSort = null; + + /** + * Now deal with the fragments + * + * The end processing semantics depends on the following factors: + * 1. Whether the end is marked as significant (as specified by USE_END_IN_UNPAIRED_READS) + * 2. Whether end certainty is specified (by UNPAIRED_END_UNCERTAINTY) + * + * - If ends are insignificant, they are ignored + * - If ends are significant and uncertainty is set to 0 - they must be equal for fragments to be considered same + * - Otherwise, fragments are accumulated (into the same bucket) as long as they are with the + * specified uncertainty from at least one existing fragment. Note that using this strategy the effective + * range of end locations associated with fragments in a bucket may grow, but only in 'uncertainty' steps. + */ + log.info("Traversing fragment information and detecting duplicates."); + ReadEndsForMarkDuplicates firstOfNextChunk = null; + int nextChunkRead1Coordinate2Min = Integer.MAX_VALUE; + int nextChunkRead1Coordinate2Max = Integer.MIN_VALUE; + final List nextChunk = new ArrayList<>(200); + boolean containsPairs = false; + boolean containsFrags = false; + + for (final ReadEndsForMarkDuplicates next : md.fragSort) { + if (firstOfNextChunk != null && areComparableForDuplicatesWithEndSignificance(firstOfNextChunk, next, useBarcodes, + nextChunkRead1Coordinate2Min, nextChunkRead1Coordinate2Max)) { + nextChunk.add(next); + containsPairs = containsPairs || next.isPaired(); + containsFrags = containsFrags || !next.isPaired(); + if ( next.read2Coordinate != END_INSIGNIFICANT_VALUE) { + nextChunkRead1Coordinate2Min = Math.min(nextChunkRead1Coordinate2Min, next.read2Coordinate); + nextChunkRead1Coordinate2Max = Math.max(nextChunkRead1Coordinate2Max, next.read2Coordinate); + + if ( firstOfNextChunk.read2Coordinate == END_INSIGNIFICANT_VALUE) + firstOfNextChunk = next; + } + } else { + if (nextChunk.size() > 1 && containsFrags) { + md.markDuplicateFragments(nextChunk, containsPairs); + } + nextChunk.clear(); + nextChunk.add(next); + firstOfNextChunk = next; + if ( next.read2Coordinate != END_INSIGNIFICANT_VALUE) + nextChunkRead1Coordinate2Min = nextChunkRead1Coordinate2Max = next.read2Coordinate; + else { + nextChunkRead1Coordinate2Min = Integer.MAX_VALUE; + nextChunkRead1Coordinate2Max = Integer.MIN_VALUE; + } + containsPairs = next.isPaired(); + containsFrags = !next.isPaired(); + } + } + md.markDuplicateFragments(nextChunk, containsPairs); + md.fragSort.cleanup(); + md.fragSort = null; + + log.info("Sorting list of duplicate records."); + md.duplicateIndexes.doneAddingStartIteration(); + if (md.opticalDuplicateIndexes != null) { + md.opticalDuplicateIndexes.doneAddingStartIteration(); + } + if (md.TAG_DUPLICATE_SET_MEMBERS) { + md.representativeReadIndicesForDuplicates.doneAdding(); + } + } + + /** + * Builds a read ends object that represents a single read - for flow based read + */ + @Override + public ReadEndsForMarkDuplicates buildReadEnds(final SAMFileHeader header, final long index, final SAMRecord rec, final boolean useBarcodes) { + final ReadEndsForMarkDuplicates ends = md.buildReadEnds(header, index, rec, useBarcodes); + + // this code only supported unpaired reads + if (rec.getReadPairedFlag() && !rec.getMateUnmappedFlag()) { + throw new IllegalArgumentException("FLOW_MODE does not support paired reads. offending read: " + rec); + } + + // adjust start/end coordinates + ends.read1Coordinate = getReadEndCoordinate(rec, !rec.getReadNegativeStrandFlag(), true, md.flowBasedArguments); + if (md.flowBasedArguments.USE_END_IN_UNPAIRED_READS) { + ends.read2Coordinate = getReadEndCoordinate(rec, rec.getReadNegativeStrandFlag(), false, md.flowBasedArguments); + } + + // adjust score + if ( md.flowBasedArguments.FLOW_QUALITY_SUM_STRATEGY ) { + ends.score = computeFlowDuplicateScore(rec, ends.read2Coordinate); + } + + return ends; + } + + /** + * update score for pairedEnds + */ + @Override + public short getReadDuplicateScore(final SAMRecord rec, final ReadEndsForMarkDuplicates pairedEnds) { + if (md.flowBasedArguments.FLOW_QUALITY_SUM_STRATEGY ) { + return computeFlowDuplicateScore(rec, pairedEnds.read2Coordinate); + } else { + return md.getReadDuplicateScore(rec, pairedEnds); + } + } + + /** + * This method is identical in function to areComparableForDuplicates except that it accomodates for + * the possible significance of the end side of the reads (w/ or wo/ uncertainty). This is only + * applicable for flow mode invocation. + */ + private boolean areComparableForDuplicatesWithEndSignificance(final ReadEndsForMarkDuplicates lhs, final ReadEndsForMarkDuplicates rhs, final boolean useBarcodes, + final int lhsRead1Coordinate2Min, final int lhsRead1Coordinate2Max) { + boolean areComparable = md.areComparableForDuplicates(lhs, rhs, false, useBarcodes); + + if (areComparable) { + areComparable = (!endCoorSignificant(lhs.read2Coordinate, rhs.read2Coordinate) || + endCoorInRangeWithUncertainty(lhsRead1Coordinate2Min, lhsRead1Coordinate2Max, + rhs.read2Coordinate, md.flowBasedArguments.UNPAIRED_END_UNCERTAINTY)); + } + + return areComparable; + } + + private boolean endCoorSignificant(final int lhsCoor, final int rhsCoor) { + return lhsCoor != END_INSIGNIFICANT_VALUE && rhsCoor != END_INSIGNIFICANT_VALUE; + } + + private boolean endCoorInRangeWithUncertainty(final int lhsCoorMin, final int lhsCoorMax, final int rhsCoor, final int uncertainty) { + return (rhsCoor >= (lhsCoorMin - uncertainty)) && (rhsCoor <= (lhsCoorMax + uncertainty)); + } + + /** + * A quality summing scoring strategy used for flow based reads. + * + * The method walks on the bases of the read, in the synthesis direction. For each base, the effective + * quality value is defined as the value on the first base on the hmer to which the base belongs to. The score + * is defined to be the sum of all effective values above a given threshold. + * + * @param rec - SAMRecord to get a score for + * @param threshold - threshold above which effective quality is included + * @return - calculated score (see method description) + */ + static protected int getFlowSumOfBaseQualities(final SAMRecord rec, final int threshold) { + int score = 0; + + // access qualities and bases + final byte[] quals = rec.getBaseQualities(); + final byte[] bases = rec.getReadBases(); + + // create iteration range and direction + final int startingOffset = !rec.getReadNegativeStrandFlag() ? 0 : bases.length; + final int endOffset = !rec.getReadNegativeStrandFlag() ? bases.length : 0; + final int iterIncr = !rec.getReadNegativeStrandFlag() ? 1 : -1; + + // loop on bases, extract qual related to homopolymer from start of homopolymer + byte lastBase = 0; + byte effectiveQual = 0; + for ( int i = startingOffset ; i != endOffset ; i += iterIncr ) { + final byte base = bases[i]; + if ( base != lastBase ) { + effectiveQual = quals[i]; + } + if ( effectiveQual >= threshold) { + score += effectiveQual; + } + lastBase = base; + } + + return score; + } + + private short computeFlowDuplicateScore(final SAMRecord rec, final int end) { + + if ( end == END_INSIGNIFICANT_VALUE) + return -1; + + Short storedScore = (Short)rec.getTransientAttribute(ATTR_DUPLICATE_SCORE); + if ( storedScore == null ) { + short score = 0; + + score += (short) Math.min(getFlowSumOfBaseQualities(rec, md.flowBasedArguments.FLOW_EFFECTIVE_QUALITY_THRESHOLD), Short.MAX_VALUE / 2); + + score += rec.getReadFailsVendorQualityCheckFlag() ? (short) (Short.MIN_VALUE / 2) : 0; + storedScore = score; + rec.setTransientAttribute(ATTR_DUPLICATE_SCORE, storedScore); + } + + return storedScore; + } + + @VisibleForTesting + protected static int getReadEndCoordinate(final SAMRecord rec, final boolean startEnd, final boolean certain, final MarkDuplicatesForFlowArgumentCollection flowBasedArguments) { + final FlowOrder flowOrder = new FlowOrder(rec); + final int unclippedCoor = startEnd ? rec.getUnclippedStart() : rec.getUnclippedEnd(); + final int alignmentCoor = startEnd ? rec.getAlignmentStart() : rec.getAlignmentEnd(); + + // this code requires a valid flow order + if ( flowOrder.isValid() ) { + + // simple case + if ( flowBasedArguments.USE_UNPAIRED_CLIPPED_END ) { + return alignmentCoor; + } + + // "skipping" case + if (certain && flowBasedArguments.FLOW_SKIP_FIRST_N_FLOWS != 0) { + final byte[] bases = rec.getReadBases(); + byte hmerBase = startEnd ? bases[0] : bases[bases.length - 1]; + int hmersLeft = flowBasedArguments.FLOW_SKIP_FIRST_N_FLOWS; // number of hmer left to trim + + // advance flow order to base + while (flowOrder.current() != hmerBase) { + flowOrder.advance(); + hmersLeft--; + } + + int hmerSize; + for (hmerSize = 1; hmerSize < bases.length; hmerSize++) { + if ((startEnd ? bases[hmerSize] : bases[bases.length - 1 - hmerSize]) != hmerBase) { + if (--hmersLeft <= 0) { + break; + } else { + hmerBase = startEnd ? bases[hmerSize] : bases[bases.length - 1 - hmerSize]; + flowOrder.advance(); + while (flowOrder.current() != hmerBase) { + flowOrder.advance(); + hmersLeft--; + } + if (hmersLeft <= 0) { + break; + } + } + } + } + final int coor = unclippedCoor + (startEnd ? hmerSize : -hmerSize); + return flowBasedArguments.USE_UNPAIRED_CLIPPED_END + ? (startEnd ? Math.max(coor, alignmentCoor) : Math.min(coor, alignmentCoor)) + : coor; + } + + // "know end" case + if (flowBasedArguments.FLOW_Q_IS_KNOWN_END ? isAdapterClipped(rec) : isAdapterClippedWithQ(rec)) { + return unclippedCoor; + } + + // "uncertain quality clipped" case + if (!certain && isQualityClipped(rec)) { + return END_INSIGNIFICANT_VALUE; + } + } + + // if here, return a default + return unclippedCoor; + } + + public static boolean isAdapterClipped(final SAMRecord rec) { + return clippingTagContainsAny(rec, CLIPPING_TAG_CONTAINS_A); + } + + public static boolean isAdapterClippedWithQ(final SAMRecord rec) { + return clippingTagContainsAny(rec, CLIPPING_TAG_CONTAINS_AQ); + } + + public static boolean isQualityClipped(final SAMRecord rec) { + return clippingTagContainsAny(rec, CLIPPING_TAG_CONTAINS_QZ); + } + + private static boolean clippingTagContainsAny(final SAMRecord rec, final char[] chars) { + final String clippingTagValue = rec.getStringAttribute(CLIPPING_TAG_NAME); + + if ( clippingTagValue == null ) { + return false; + } else { + for ( final char ch : chars ) { + if ( clippingTagValue.indexOf(ch) >= 0 ) { + return true; + } + } + return false; + } + } + + /** + * private class used to represent use a SAMRecord's flow order, if such is present + */ + static private class FlowOrder { + + byte[] flowOrder; // the flow order byte string + int flowIndex = 0; // the current position on the flow order + + private FlowOrder(final SAMRecord rec) { + + // access flow order from record's read group + if ( rec.getReadGroup() != null && rec.getReadGroup().getFlowOrder() != null ) { + flowOrder = rec.getReadGroup().getFlowOrder().getBytes(); + return; + } + + // fallback on finding a flow order elsewhere + final SAMFileHeader header = rec.getHeader(); + for ( final SAMReadGroupRecord rg : header.getReadGroups() ) { + if (rg.getFlowOrder() != null) { + flowOrder = rg.getFlowOrder().getBytes(); + return; + } + } + + // otherwise, no flow order + flowOrder = null; + } + + private boolean isValid() { + return flowOrder != null; + } + + private void advance() { + if (++flowIndex >= flowOrder.length) { + flowIndex = 0; + } + } + + private byte current() { + return flowOrder[flowIndex]; + } + } +} diff --git a/src/main/java/picard/sam/markduplicates/MarkDuplicatesHelper.java b/src/main/java/picard/sam/markduplicates/MarkDuplicatesHelper.java new file mode 100644 index 0000000000..18462c452e --- /dev/null +++ b/src/main/java/picard/sam/markduplicates/MarkDuplicatesHelper.java @@ -0,0 +1,12 @@ +package picard.sam.markduplicates; + +import htsjdk.samtools.SAMFileHeader; +import htsjdk.samtools.SAMRecord; +import picard.sam.markduplicates.util.ReadEndsForMarkDuplicates; + +public interface MarkDuplicatesHelper { + + void generateDuplicateIndexes(final boolean useBarcodes, final boolean indexOpticalDuplicates); + ReadEndsForMarkDuplicates buildReadEnds(final SAMFileHeader header, final long index, final SAMRecord rec, final boolean useBarcodes); + short getReadDuplicateScore(final SAMRecord rec, final ReadEndsForMarkDuplicates pairedEnds); +} diff --git a/src/test/java/picard/sam/markduplicates/MarkDuplicatesForFlowHelperTest.java b/src/test/java/picard/sam/markduplicates/MarkDuplicatesForFlowHelperTest.java new file mode 100644 index 0000000000..cfddeacf0a --- /dev/null +++ b/src/test/java/picard/sam/markduplicates/MarkDuplicatesForFlowHelperTest.java @@ -0,0 +1,342 @@ +/* + * The MIT License + * + * Copyright (c) 2022 The Broad Institute + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ + +package picard.sam.markduplicates; + +import htsjdk.samtools.DuplicateScoringStrategy; +import htsjdk.samtools.SAMRecord; +import org.testng.Assert; +import org.testng.annotations.BeforeClass; +import org.testng.annotations.DataProvider; +import org.testng.annotations.Test; + +import java.io.File; + +/** + * This class defines the individual test cases to run. The actual running of the test is done + * by MarkDuplicatesTester (see getTester). + */ +public class MarkDuplicatesForFlowHelperTest { + protected static String TEST_BASE_NAME = null; + protected static File TEST_DATA_DIR = null; + final static String FLOW_ORDER = "TGCA"; + + + @BeforeClass + public void setUp() { + TEST_BASE_NAME = "MarkDuplicatesForFlowHelper"; + TEST_DATA_DIR = new File("testdata/picard/sam/MarkDuplicates"); + } + + protected AbstractMarkDuplicatesCommandLineProgramTester getTester() { + return new MarkDuplicatesTester(); + } + + private interface TesterModifier { + void modify(AbstractMarkDuplicatesCommandLineProgramTester tester); + } + + private static class TestRecordInfo { + final int length; + final int alignmentStart; + final String cigar; + final boolean isDuplicate; + final String startMod; + final String endMod; + + TestRecordInfo(final int length, final int alignmentStart, final String cigar, final boolean isDuplicate, + final String startMod, final String endMod) { + this.length = length; + this.alignmentStart = alignmentStart; + this.cigar = cigar; + this.isDuplicate = isDuplicate; + this.startMod = startMod; + this.endMod = endMod; + } + } + + @DataProvider(name ="forFlowDataProvider") + public Object[][] forFlowDataProvider() { + return new Object[][] { + // testUSE_END_IN_UNPAIRED_READS: End location is not significant + { + null, + new TestRecordInfo[] { + new TestRecordInfo(76, 12, null, false, null, null), + new TestRecordInfo(74, 12, null, true, null, null) + }, + new String[] { "USE_END_IN_UNPAIRED_READS=false" }, null + }, + + // testUSE_END_IN_UNPAIRED_READS: End location is significant + { + null, + new TestRecordInfo[] { + new TestRecordInfo(76, 12, null, false, null, null), + new TestRecordInfo(74, 12, null, false, null, null) + }, + new String[] { "USE_END_IN_UNPAIRED_READS=true" }, null + }, + + // testUSE_UNPAIRED_CLIPPED_END: Do not use clipped locations (meaning, use unclipped) + { + null, + new TestRecordInfo[] { + new TestRecordInfo(76, 12, "1S76M", false, null, null), + new TestRecordInfo(74, 12, "74M", false, null, null) + }, + new String[] { "USE_UNPAIRED_CLIPPED_END=false" }, null + }, + + // testUSE_UNPAIRED_CLIPPED_END: Use clipped locations (meaning, use clipped) + { + null, + new TestRecordInfo[] { + new TestRecordInfo(76, 12, "1S76M", false, null, null), + new TestRecordInfo(74, 12, "74M", true, null, null) + }, + new String[] { "USE_UNPAIRED_CLIPPED_END=true" }, null + }, + + // testUSE_UNPAIRED_CLIPPED_END: Use clipped locations (meaning, use clipped) + { + null, + new TestRecordInfo[] { + new TestRecordInfo(76, 12, "1S76M1S", true, null, null), + new TestRecordInfo(78, 11, "78M", false, null, null) + }, + new String[] { "USE_UNPAIRED_CLIPPED_END=false", "USE_END_IN_UNPAIRED_READS=true" }, null + }, + + // testUSE_UNPAIRED_CLIPPED_END: Use clipped locations (meaning, use clipped) + { + null, + new TestRecordInfo[] { + new TestRecordInfo(76, 12, "1S76M1S", false, null, null), + new TestRecordInfo(78, 11, "78M", false, null, null) + }, + new String[] { "USE_UNPAIRED_CLIPPED_END=true", "USE_END_IN_UNPAIRED_READS=true" }, null + }, + + // testFLOW_SKIP_FIRST_N_FLOWS: Do not use clipped locations (meaning, use unclipped) + { + null, + new TestRecordInfo[] { + new TestRecordInfo(76, 12,"76M", false, "ACGTT", "TTGCA"), + new TestRecordInfo(76, 12, "76M", true, "ACGGT", "TGGCA") + }, + new String[] { "USE_END_IN_UNPAIRED_READS=true", "FLOW_SKIP_FIRST_N_FLOWS=0" }, null + }, + + // testFLOW_SKIP_FIRST_N_FLOWS: Do not use clipped locations (meaning, use unclipped) + { + null, + new TestRecordInfo[] { + new TestRecordInfo(76, 12,"76M", false, "ACGTT", "TTGCA"), + new TestRecordInfo(76, 12, "76M", false, "CCGGT", "TGGCA") + }, + new String[] { "USE_END_IN_UNPAIRED_READS=true", "FLOW_SKIP_FIRST_N_FLOWS=3" }, null + }, + + // testFLOW_QUALITY_SUM_STRATEGY: normal sum + { + DuplicateScoringStrategy.ScoringStrategy.SUM_OF_BASE_QUALITIES, + new TestRecordInfo[] { + new TestRecordInfo(76, 12,"76M", true, "AAAC", null), + new TestRecordInfo(76, 12, "76M", false, "AACC", null) + }, + new String[] { "FLOW_QUALITY_SUM_STRATEGY=false" }, + new TesterModifier() { + @Override + public void modify(final AbstractMarkDuplicatesCommandLineProgramTester tester) { + final SAMRecord[] records = tester.getSamRecordSetBuilder().getRecords().toArray(new SAMRecord[0]); + records[0].setAttribute("tp", new int[76]); + records[1].setAttribute("tp", new int[76]); + records[0].getBaseQualities()[1] = 25; // dip inside AAA + } + } + }, + + // testFLOW_QUALITY_SUM_STRATEGY: flow (homopolymer based) sum + { + DuplicateScoringStrategy.ScoringStrategy.SUM_OF_BASE_QUALITIES, + new TestRecordInfo[] { + new TestRecordInfo(76, 12,"76M", false, "AAAC", null), + new TestRecordInfo(76, 12, "76M", true, "AACC", null) + }, + new String[] { "FLOW_QUALITY_SUM_STRATEGY=true" }, + new TesterModifier() { + @Override + public void modify(final AbstractMarkDuplicatesCommandLineProgramTester tester) { + final SAMRecord[] records = tester.getSamRecordSetBuilder().getRecords().toArray(new SAMRecord[0]); + records[0].setAttribute("tp", new int[76]); + records[1].setAttribute("tp", new int[76]); + records[0].getBaseQualities()[1] = 25; // dip inside AAA + } + } + }, + + // testUNPAIRED_END_UNCERTAINTY: End location is significant and uncertain, end sorted + { + null, + new TestRecordInfo[] { + new TestRecordInfo(74, 12, null, true, null, null), + new TestRecordInfo(84, 12, null, true, null, null), + new TestRecordInfo(94, 12, null, false, null, null) + }, + new String[] { "USE_END_IN_UNPAIRED_READS=true", "UNPAIRED_END_UNCERTAINTY=10" }, null + }, + + // testUNPAIRED_END_UNCERTAINTY: End location is significant and uncertain, end not sorted + { + null, + new TestRecordInfo[] { + new TestRecordInfo(174, 12, null, true, null, null), + new TestRecordInfo(194, 12, null, false, null, null), + new TestRecordInfo(184, 12, null, true, null, null) + }, + new String[] { "USE_END_IN_UNPAIRED_READS=true", "UNPAIRED_END_UNCERTAINTY=10" }, null + }, + + // testUNPAIRED_END_UNCERTAINTY: End location is significant and uncertain, end not sorted, multiple non-dup + { + null, + new TestRecordInfo[] { + new TestRecordInfo(174, 12, null, true, null, null), + new TestRecordInfo(294, 12, null, false, null, null), + new TestRecordInfo(184, 12, null, false, null, null) + }, + new String[] { "USE_END_IN_UNPAIRED_READS=true", "UNPAIRED_END_UNCERTAINTY=10" }, null + }, + // Barcode + { + null, + new TestRecordInfo[] { + new TestRecordInfo(76, 12, null, false, null, null), + new TestRecordInfo(74, 12, null, true, null, null) + }, + new String[] { "USE_END_IN_UNPAIRED_READS=false", "BARCODE_TAG=BC" }, + new TesterModifier() { + @Override + public void modify(final AbstractMarkDuplicatesCommandLineProgramTester tester) { + final SAMRecord[] records = tester.getSamRecordSetBuilder().getRecords().toArray(new SAMRecord[0]); + records[0].setAttribute("BC", "A"); + records[1].setAttribute("BC", "A"); + } + } + }, + // Barcode + { + null, + new TestRecordInfo[] { + new TestRecordInfo(76, 12, null, false, null, null), + new TestRecordInfo(74, 12, null, false, null, null) + }, + new String[] { "USE_END_IN_UNPAIRED_READS=false", "BARCODE_TAG=BC" }, + new TesterModifier() { + @Override + public void modify(final AbstractMarkDuplicatesCommandLineProgramTester tester) { + final SAMRecord[] records = tester.getSamRecordSetBuilder().getRecords().toArray(new SAMRecord[0]); + records[0].setAttribute("BC", "A"); + records[1].setAttribute("BC", "T"); + } + } + }, + }; + } + + @Test(dataProvider = "forFlowDataProvider") + public void testForFlow(final DuplicateScoringStrategy.ScoringStrategy scoringStrategy, final TestRecordInfo[] recInfos, final String[] params, TesterModifier modifier) { + + // get tester, build records + final AbstractMarkDuplicatesCommandLineProgramTester tester = + scoringStrategy == null ? getTester() : new MarkDuplicatesTester(scoringStrategy); + for ( final TestRecordInfo info : recInfos ) { + tester.getSamRecordSetBuilder().setReadLength(info.length); + if ( info.cigar != null ) { + tester.addMappedFragment(0, info.alignmentStart, info.isDuplicate, info.cigar, 50); + } else { + tester.addMappedFragment(0, info.alignmentStart, info.isDuplicate, 50); + } + } + + // modify records + final SAMRecord[] records = tester.getSamRecordSetBuilder().getRecords().toArray(new SAMRecord[0]); + for ( int i = 0 ; i < records.length ; i++ ) { + final SAMRecord rec = records[i]; + final TestRecordInfo info = recInfos[i]; + + if ( info.startMod != null ) { + System.arraycopy(info.startMod.getBytes(), 0, rec.getReadBases(), 0, info.startMod.length()); + } + if ( info.endMod != null ) { + System.arraycopy(info.endMod.getBytes(), 0, rec.getReadBases(), rec.getReadBases().length - info.endMod.length(), info.endMod.length()); + } + } + + // add parames, set flow order + tester.addArg("FLOW_MODE=true"); + for ( final String param : params ) { + tester.addArg(param); + } + tester.getSamRecordSetBuilder().getHeader().getReadGroups().get(0).setFlowOrder(FLOW_ORDER); + + // further modify tester + if ( modifier != null ) { + modifier.modify(tester); + } + + // run test + tester.runTest(); + } + + @DataProvider(name ="getFlowSumOfBaseQualitiesDataProvider") + public Object[][] getFlowSumOfBaseQualitiesDataProvider() { + return new Object[][] { + { "AAAA", new byte[] {50,50,50,50}, 30, 200 }, + { "AAAA", new byte[] {50,10,10,50}, 30, 200 }, + { "AAAA", new byte[] {20,10,10,20}, 30, 0 }, + { "AABBCC", new byte[] {50,50,10,10,40,40}, 30, 180 }, + }; + } + + @Test(dataProvider = "getFlowSumOfBaseQualitiesDataProvider") + public void testGetFlowSumOfBaseQualities(final String bases, final byte[] quals, final int threshold, final int expectedScore) { + + // build record + final AbstractMarkDuplicatesCommandLineProgramTester tester = getTester(); + tester.getSamRecordSetBuilder().setReadLength(bases.length()); + tester.addMappedFragment(0, 12, false, 50); + + // install bases and quals + final SAMRecord rec = tester.getSamRecordSetBuilder().getRecords().iterator().next(); + System.arraycopy(bases.getBytes(), 0, rec.getReadBases(), 0,bases.length()); + System.arraycopy(quals, 0, rec.getBaseQualities(), 0, quals.length); + + // calculate score + final int score = MarkDuplicatesForFlowHelper.getFlowSumOfBaseQualities(rec, threshold); + Assert.assertEquals(score, expectedScore); + } + +} diff --git a/testdata/picard/sam/CollectGcBiasMetrics/CollectGcBias11612750771983880335.bam b/testdata/picard/sam/CollectGcBiasMetrics/CollectGcBias11612750771983880335.bam new file mode 100644 index 0000000000..00dba076d8 Binary files /dev/null and b/testdata/picard/sam/CollectGcBiasMetrics/CollectGcBias11612750771983880335.bam differ diff --git a/testdata/picard/sam/CollectGcBiasMetrics/CollectGcBias6837461464900918747.bam b/testdata/picard/sam/CollectGcBiasMetrics/CollectGcBias6837461464900918747.bam new file mode 100644 index 0000000000..b3954671a9 Binary files /dev/null and b/testdata/picard/sam/CollectGcBiasMetrics/CollectGcBias6837461464900918747.bam differ diff --git a/testdata/picard/sam/CollectQualityYieldMetrics/collect_quality_yield_test1.sam b/testdata/picard/sam/CollectQualityYieldMetrics/collect_quality_yield_test1.sam new file mode 100644 index 0000000000..02352f82a7 --- /dev/null +++ b/testdata/picard/sam/CollectQualityYieldMetrics/collect_quality_yield_test1.sam @@ -0,0 +1,6 @@ +@HD VN:1.0 SO:coordinate +@SQ SN:chr1 LN:101 UR:file:testdata/net/sf/picard/sam/merger.fasta M5:bd01f7e11515bb6beda8f7257902aa67 +@RG ID:0 SM:Hi,Momma! LB:whatever PU:me PL:ILLUMINA +READ_1 0 chr1 1 255 10M * 0 0 TTTTTTTTTT :::::::::: RG:Z:0 XN:i:1 +READ_2 512 chr1 1 255 10M * 0 0 TTTTTTTTTT :::::::::: RG:Z:0 XN:i:1 +READ_3 0 chr1 1 255 10M * 0 0 TTTTTTTTTT <<<<