-
Notifications
You must be signed in to change notification settings - Fork 371
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
1 parent
97854c7
commit 50b004b
Showing
12 changed files
with
1,147 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,231 @@ | ||
/* | ||
* The MIT License | ||
* | ||
* Copyright (c) 2022 The Broad Institute | ||
* | ||
* Permission is hereby granted, free of charge, to any person obtaining a copy | ||
* of this software and associated documentation files (the "Software"), to deal | ||
* in the Software without restriction, including without limitation the rights | ||
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell | ||
* copies of the Software, and to permit persons to whom the Software is | ||
* furnished to do so, subject to the following conditions: | ||
* | ||
* The above copyright notice and this permission notice shall be included in | ||
* all copies or substantial portions of the Software. | ||
* | ||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR | ||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, | ||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE | ||
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER | ||
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, | ||
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN | ||
* THE SOFTWARE. | ||
*/ | ||
|
||
package picard.analysis; | ||
|
||
import htsjdk.samtools.SAMRecord; | ||
import htsjdk.samtools.util.Histogram; | ||
import htsjdk.samtools.util.QualityUtil; | ||
|
||
import java.util.ArrayList; | ||
import java.util.Arrays; | ||
import java.util.List; | ||
|
||
final class HistogramGenerator { | ||
final boolean useOriginalQualities; | ||
int maxLengthSoFar = 0; | ||
double[] firstReadTotalsByCycle = new double[maxLengthSoFar]; | ||
double[] firstReadTotalProbsByCycle = new double[maxLengthSoFar]; | ||
long[] firstReadCountsByCycle = new long[maxLengthSoFar]; | ||
double[] secondReadTotalsByCycle = new double[maxLengthSoFar]; | ||
double[] secondReadTotalProbsByCycle = new double[maxLengthSoFar]; | ||
long[] secondReadCountsByCycle = new long[maxLengthSoFar]; | ||
int recordsCount = 0; | ||
final public int skipBases = 10; //qualities of the first bases are ignored (inference issues) | ||
final public int minimalCount = 25; | ||
|
||
public HistogramGenerator(final boolean useOriginalQualities) { | ||
this.useOriginalQualities = useOriginalQualities; | ||
} | ||
|
||
public HistogramGenerator(final double[] firstReadTotalsByCycle, final double[] firstReadTotalProbsByCycle, | ||
final long[] firstReadCountsByCycle, final double[] secondReadTotalsByCycle, | ||
final double[] secondReadTotalProbsByCycle, final long[] secondReadCountsByCycle, int nRecords){ | ||
this.firstReadCountsByCycle = firstReadCountsByCycle.clone(); | ||
this.firstReadTotalsByCycle = firstReadTotalsByCycle.clone(); | ||
this.firstReadTotalProbsByCycle = firstReadTotalProbsByCycle.clone(); | ||
this.secondReadCountsByCycle = secondReadCountsByCycle.clone(); | ||
this.secondReadTotalsByCycle = secondReadTotalsByCycle.clone(); | ||
this.secondReadTotalProbsByCycle = secondReadTotalProbsByCycle.clone(); | ||
this.useOriginalQualities = false; | ||
this.recordsCount = nRecords; | ||
} | ||
|
||
public void addRecord(final SAMRecord rec) { | ||
final byte[] quals = (useOriginalQualities ? rec.getOriginalBaseQualities() : rec.getBaseQualities()); | ||
if (quals == null) return; | ||
recordsCount++; | ||
final int length = quals.length; | ||
final boolean rc = rec.getReadNegativeStrandFlag(); | ||
ensureArraysBigEnough(length+1); | ||
|
||
for (int i=0; i<length; ++i) { | ||
final int cycle = rc ? length-i : i+1; | ||
|
||
if (rec.getReadPairedFlag() && rec.getSecondOfPairFlag()) { | ||
secondReadTotalsByCycle[cycle] += quals[i]; | ||
secondReadTotalProbsByCycle[cycle] += QualityUtil.getErrorProbabilityFromPhredScore(quals[i]); | ||
secondReadCountsByCycle[cycle] += 1; | ||
} | ||
else { | ||
firstReadTotalsByCycle[cycle] += quals[i]; | ||
firstReadTotalProbsByCycle[cycle] += QualityUtil.getErrorProbabilityFromPhredScore(quals[i]); | ||
firstReadCountsByCycle[cycle] += 1; | ||
} | ||
} | ||
} | ||
|
||
/** | ||
* Used to merge two histogram generators together | ||
* @param other | ||
*/ | ||
public void addOtherHistogramGenerator(final HistogramGenerator other){ | ||
if (other!=null){ | ||
ensureArraysBigEnough(other.maxLengthSoFar); | ||
for (int i = 0; i < maxLengthSoFar; i++){ | ||
firstReadCountsByCycle[i] += other.firstReadCountsByCycle[i]; | ||
secondReadCountsByCycle[i] += other.secondReadCountsByCycle[i]; | ||
firstReadTotalsByCycle[i] += other.firstReadTotalsByCycle[i]; | ||
secondReadTotalsByCycle[i] += other.secondReadTotalsByCycle[i]; | ||
firstReadTotalProbsByCycle[i] += other.firstReadTotalProbsByCycle[i]; | ||
secondReadTotalProbsByCycle[i] += other.secondReadTotalProbsByCycle[i]; | ||
} | ||
recordsCount += other.recordsCount; | ||
} | ||
|
||
} | ||
|
||
public int calculateLQ(final int threshold, int readInPair, int spanningWindowLength){ | ||
final double errorProbThreshold = QualityUtil.getErrorProbabilityFromPhredScore(threshold); | ||
List<Double> result = new ArrayList<>(); | ||
List<Long> weights = new ArrayList<>(); | ||
double[] accumulator; | ||
long[] counts; | ||
if (readInPair == 1){ | ||
accumulator = firstReadTotalProbsByCycle; | ||
counts = firstReadCountsByCycle; | ||
} else { | ||
accumulator = secondReadTotalProbsByCycle; | ||
counts = secondReadCountsByCycle; | ||
} | ||
for (int i = skipBases; i < accumulator.length; i++ ){ | ||
if (counts[i] < minimalCount){ | ||
break; | ||
} | ||
result.add(accumulator[i]/counts[i]); | ||
weights.add(counts[i]); | ||
} | ||
applySpanningWindowMean(result, weights, spanningWindowLength); | ||
return longestHighQuality(result,errorProbThreshold); | ||
} | ||
|
||
private void applySpanningWindowMean(List<Double> vector, List<Long> weights, final int spanLength){ | ||
List<Double> tmp = new ArrayList<>(vector); | ||
for (int i = 0; i < vector.size(); i++){ | ||
double tmpEr = 0; | ||
long tmpWeight = 0; | ||
for (int j = Math.max(i-spanLength,0); j < Math.min(i+spanLength+1, vector.size()); j++){ | ||
tmpEr += tmp.get(j)*weights.get(j); | ||
tmpWeight += weights.get(j); | ||
} | ||
vector.set(i, tmpEr/tmpWeight); | ||
} | ||
} | ||
|
||
private int longestHighQuality(List<Double> averageErrorProbabilities, double errorProbThreshold){ | ||
int curStart = 0; | ||
int curEnd = 0; | ||
int curBestIntervalLength = 0; | ||
|
||
while ( curEnd < averageErrorProbabilities.size() ) { | ||
if (averageErrorProbabilities.get(curEnd) <= errorProbThreshold) { | ||
curEnd++; | ||
} else { | ||
if ((curEnd - curStart) > curBestIntervalLength) { | ||
curBestIntervalLength = curEnd - curStart; | ||
} | ||
curStart = curEnd + 1; | ||
curEnd = curStart; | ||
} | ||
} | ||
if ((curEnd - curStart) > curBestIntervalLength) { | ||
curBestIntervalLength = curEnd - curStart; | ||
} | ||
return curBestIntervalLength; | ||
} | ||
|
||
boolean isEmpty() { | ||
return maxLengthSoFar == 0; | ||
} | ||
|
||
|
||
private void ensureArraysBigEnough(final int length) { | ||
if (length > maxLengthSoFar) { | ||
firstReadTotalsByCycle = Arrays.copyOf(firstReadTotalsByCycle, length); | ||
firstReadTotalProbsByCycle = Arrays.copyOf(firstReadTotalProbsByCycle, length); | ||
firstReadCountsByCycle = Arrays.copyOf(firstReadCountsByCycle, length); | ||
secondReadTotalsByCycle = Arrays.copyOf(secondReadTotalsByCycle , length); | ||
secondReadTotalProbsByCycle = Arrays.copyOf(secondReadTotalProbsByCycle , length); | ||
secondReadCountsByCycle = Arrays.copyOf(secondReadCountsByCycle, length); | ||
maxLengthSoFar = length; | ||
} | ||
} | ||
|
||
Histogram<Integer> getMeanQualityHistogram() { | ||
final String label = useOriginalQualities ? "MEAN_ORIGINAL_QUALITY" : "MEAN_QUALITY"; | ||
final Histogram<Integer> meanQualities = new Histogram<Integer>("CYCLE", label); | ||
|
||
int firstReadLength = 0; | ||
|
||
for (int cycle=0; cycle < firstReadTotalsByCycle.length; ++cycle) { | ||
if (firstReadTotalsByCycle[cycle] > 0) { | ||
meanQualities.increment(cycle, firstReadTotalsByCycle[cycle] / firstReadCountsByCycle[cycle]); | ||
firstReadLength = cycle; | ||
} | ||
} | ||
|
||
for (int i=0; i< secondReadTotalsByCycle.length; ++i) { | ||
if (secondReadCountsByCycle[i] > 0) { | ||
final int cycle = firstReadLength + i; | ||
meanQualities.increment(cycle, secondReadTotalsByCycle[i] / secondReadCountsByCycle[i]); | ||
} | ||
} | ||
|
||
return meanQualities; | ||
} | ||
|
||
Histogram<Integer> getMeanErrorProbHistogram() { | ||
final String label = useOriginalQualities ? "MEAN_ORIGINAL_ERROR_PROB" : "MEAN_ERROR_PROB"; | ||
final Histogram<Integer> meanQualities = new Histogram<Integer>("CYCLE", label); | ||
|
||
int firstReadLength = 0; | ||
|
||
for (int cycle=0; cycle < firstReadTotalsByCycle.length; ++cycle) { | ||
if (firstReadTotalsByCycle[cycle] > 0) { | ||
meanQualities.increment(cycle, firstReadTotalProbsByCycle[cycle] / firstReadCountsByCycle[cycle]); | ||
firstReadLength = cycle; | ||
} | ||
} | ||
|
||
for (int i=0; i< secondReadTotalsByCycle.length; ++i) { | ||
if (secondReadCountsByCycle[i] > 0) { | ||
final int cycle = firstReadLength + i; | ||
meanQualities.increment(cycle, secondReadTotalProbsByCycle[i] / secondReadCountsByCycle[i]); | ||
} | ||
} | ||
|
||
return meanQualities; | ||
} | ||
|
||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,19 @@ | ||
package picard.sam; | ||
|
||
public class DuplicationMetricsFactory { | ||
|
||
// create a DuplicationMetrics for a specific read group | ||
public static DuplicationMetrics createMetrics(final boolean flowMetrics) { | ||
|
||
// create based on the presence of flow order | ||
if ( !flowMetrics ) { | ||
return new DuplicationMetrics(); | ||
} else { | ||
return new FlowBasedDuplicationMetrics(); | ||
} | ||
} | ||
|
||
public static DuplicationMetrics createMetrics() { | ||
return new DuplicationMetrics(); | ||
} | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,74 @@ | ||
package picard.sam; | ||
|
||
import htsjdk.samtools.SAMRecord; | ||
import picard.sam.markduplicates.util.AbstractMarkDuplicatesCommandLineProgram; | ||
import picard.util.MathUtil; | ||
|
||
public class FlowBasedDuplicationMetrics extends DuplicationMetrics { | ||
|
||
/* | ||
* count of single end reads where the exact fragment length is known (i.e. clipped) | ||
*/ | ||
@MergeByAdding | ||
public long UNPAIRED_WITH_TLEN; | ||
|
||
/* | ||
* count of single end duplicates where the exact fragment length is | ||
* unknown (quality trimmed, not clipped) | ||
*/ | ||
@MergeByAdding | ||
public long UNPAIRED_DUPS_WITHOUT_TLEN; | ||
|
||
/* | ||
* count of duplicates where both ends are known | ||
*/ | ||
@MergeByAdding | ||
public long UNPAIRED_DUPS_WITH_TLEN; | ||
|
||
/** | ||
* The fraction of duplicated reads out of all reads with exact | ||
* fragment length unknown | ||
*/ | ||
@NoMergingIsDerived | ||
public Double UNPAIRED_DUP_RATE_WITHOUT_TLEN; | ||
|
||
/** | ||
* The fraction of duplicated reads out of all reads with exact fragment | ||
* length known | ||
*/ | ||
@NoMergingIsDerived | ||
public Double UNPAIRED_DUP_RATE_WITH_TLEN; | ||
|
||
|
||
@Override | ||
public void calculateDerivedFields() { | ||
super.calculateDerivedFields(); | ||
|
||
UNPAIRED_DUP_RATE_WITHOUT_TLEN = MathUtil.divide(UNPAIRED_DUPS_WITHOUT_TLEN, UNPAIRED_READS_EXAMINED - UNPAIRED_WITH_TLEN); | ||
UNPAIRED_DUP_RATE_WITH_TLEN = MathUtil.divide(UNPAIRED_DUPS_WITH_TLEN, UNPAIRED_WITH_TLEN); | ||
} | ||
|
||
public void addDuplicateReadToMetrics(final SAMRecord rec) { | ||
super.addDuplicateReadToMetrics(rec); | ||
|
||
if (!rec.isSecondaryOrSupplementary() && !rec.getReadUnmappedFlag()) { | ||
if (!rec.getReadPairedFlag() || rec.getMateUnmappedFlag()) { | ||
if ( AbstractMarkDuplicatesCommandLineProgram.isSingleEndReadKnownFragment(rec) ) { | ||
++UNPAIRED_DUPS_WITH_TLEN; | ||
} else { | ||
++UNPAIRED_DUPS_WITHOUT_TLEN; | ||
} | ||
} | ||
} | ||
} | ||
|
||
public void addReadToLibraryMetrics(final SAMRecord rec) { | ||
|
||
super.addReadToLibraryMetrics(rec); | ||
|
||
if (AbstractMarkDuplicatesCommandLineProgram.isSingleEndReadKnownFragment(rec)) { | ||
++UNPAIRED_WITH_TLEN; | ||
} | ||
} | ||
|
||
} |
41 changes: 41 additions & 0 deletions
41
src/main/java/picard/sam/markduplicates/MarkDuplicatesForFlowArgumentCollection.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,41 @@ | ||
package picard.sam.markduplicates; | ||
|
||
import org.broadinstitute.barclay.argparser.Argument; | ||
|
||
public class MarkDuplicatesForFlowArgumentCollection { | ||
|
||
@Argument(doc = "enable parameters and behavior specific to flow based reads.", optional = true) | ||
public boolean FLOW_MODE = false; | ||
|
||
@Argument(doc = "Use specific quality summing strategy for flow based reads. The strategy ensures that the same " + | ||
"(and correct) quality value is used for all bases of the same homopolymer.", optional = true) | ||
public boolean FLOW_QUALITY_SUM_STRATEGY = false; | ||
|
||
@Argument(doc = "Make the end location of single end read be significant when considering duplicates, " + | ||
"in addition to the start location, which is always significant (i.e. require single-ended reads to start and" + | ||
"end on the same position to be considered duplicate) " + | ||
"(for this argument, \"read end\" means 3' end).", optional = true) | ||
public boolean USE_END_IN_UNPAIRED_READS = false; | ||
|
||
@Argument(doc = "Use position of the clipping as the end position, when considering duplicates (or use the unclipped end position) " + | ||
"(for this argument, \"read end\" means 3' end).", optional = true) | ||
public boolean USE_UNPAIRED_CLIPPED_END = false; | ||
|
||
@Argument(doc = "Maximal difference of the read end position that counted as equal. Useful for flow based " + | ||
"reads where the end position might vary due to sequencing errors. " + | ||
"(for this argument, \"read end\" means 3' end)", optional = true) | ||
public int UNPAIRED_END_UNCERTAINTY = 0; | ||
|
||
@Argument(doc = "Skip first N flows, starting from the read's start, when considering duplicates. Useful for flow based reads where sometimes there " + | ||
"is noise in the first flows " + | ||
"(for this argument, \"read start\" means 5' end).", optional = true) | ||
public int FLOW_SKIP_FIRST_N_FLOWS = 0; | ||
|
||
@Argument(doc = "Treat position of read trimming based on quality as the known end (relevant for flow based reads). Default false - if the read " + | ||
"is trimmed on quality its end is not defined and the read is duplicate of any read starting at the same place.", optional = true) | ||
public boolean FLOW_Q_IS_KNOWN_END = false; | ||
|
||
@Argument(doc = "Threshold for considering a quality value high enough to be included when calculating FLOW_QUALITY_SUM_STRATEGY calculation.", optional = true) | ||
public int FLOW_EFFECTIVE_QUALITY_THRESHOLD = 15; | ||
|
||
} |
Oops, something went wrong.