Skip to content

Commit

Permalink
Added rebase fixes
Browse files Browse the repository at this point in the history
  • Loading branch information
ilyasoifer committed Apr 4, 2022
1 parent 0bb0a47 commit bbbc6ed
Show file tree
Hide file tree
Showing 12 changed files with 1,147 additions and 0 deletions.
231 changes: 231 additions & 0 deletions src/main/java/picard/analysis/HistogramGenerator.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,231 @@
/*
* The MIT License
*
* Copyright (c) 2022 The Broad Institute
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*/

package picard.analysis;

import htsjdk.samtools.SAMRecord;
import htsjdk.samtools.util.Histogram;
import htsjdk.samtools.util.QualityUtil;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;

/**
 * Accumulates per-cycle base-quality statistics (sum of phred scores, sum of error
 * probabilities, and observation counts) separately for first- and second-of-pair reads,
 * and derives mean-quality histograms and the length of the longest high-quality
 * read prefix from those accumulators.
 *
 * <p>Not thread-safe. Per-thread instances can be merged with
 * {@link #addOtherHistogramGenerator(HistogramGenerator)}.
 */
final class HistogramGenerator {
    /** When true, accumulate OQ (original base qualities) instead of the current qualities. */
    final boolean useOriginalQualities;

    /** Current length of every per-cycle array; grows as longer reads are observed. */
    int maxLengthSoFar = 0;

    // Per-cycle accumulators. addRecord indexes them by 1-based cycle, so index 0 is unused.
    double[] firstReadTotalsByCycle = new double[maxLengthSoFar];      // sum of phred scores
    double[] firstReadTotalProbsByCycle = new double[maxLengthSoFar];  // sum of error probabilities
    long[] firstReadCountsByCycle = new long[maxLengthSoFar];          // observation counts
    double[] secondReadTotalsByCycle = new double[maxLengthSoFar];
    double[] secondReadTotalProbsByCycle = new double[maxLengthSoFar];
    long[] secondReadCountsByCycle = new long[maxLengthSoFar];

    /** Number of records whose qualities have been accumulated. */
    int recordsCount = 0;

    /** Qualities of the first bases are ignored by {@link #calculateLQ} (inference issues). */
    public final int skipBases = 10;

    /** Minimal per-cycle observation count for a cycle to participate in {@link #calculateLQ}. */
    public final int minimalCount = 25;

    /**
     * Creates an empty generator.
     *
     * @param useOriginalQualities whether to read OQ instead of the current base qualities
     */
    public HistogramGenerator(final boolean useOriginalQualities) {
        this.useOriginalQualities = useOriginalQualities;
    }

    /**
     * Builds a generator from pre-computed accumulators. All arrays are defensively copied
     * and are expected to share the same length. {@code useOriginalQualities} is fixed to
     * {@code false} for generators built this way.
     *
     * @param nRecords the number of records that produced the supplied accumulators
     */
    public HistogramGenerator(final double[] firstReadTotalsByCycle, final double[] firstReadTotalProbsByCycle,
                              final long[] firstReadCountsByCycle, final double[] secondReadTotalsByCycle,
                              final double[] secondReadTotalProbsByCycle, final long[] secondReadCountsByCycle, int nRecords){
        this.firstReadCountsByCycle = firstReadCountsByCycle.clone();
        this.firstReadTotalsByCycle = firstReadTotalsByCycle.clone();
        this.firstReadTotalProbsByCycle = firstReadTotalProbsByCycle.clone();
        this.secondReadCountsByCycle = secondReadCountsByCycle.clone();
        this.secondReadTotalsByCycle = secondReadTotalsByCycle.clone();
        this.secondReadTotalProbsByCycle = secondReadTotalProbsByCycle.clone();
        this.useOriginalQualities = false;
        this.recordsCount = nRecords;
        // Fix: keep maxLengthSoFar consistent with the supplied arrays. Previously it stayed 0,
        // which made isEmpty() report true and let ensureArraysBigEnough truncate these arrays
        // (Arrays.copyOf shrinks when the target length is smaller) on a later merge.
        this.maxLengthSoFar = firstReadTotalsByCycle.length;
    }

    /**
     * Accumulates the base qualities of one record into the per-cycle arrays.
     * Records without the requested qualities are skipped.
     *
     * @param rec the record whose qualities to accumulate
     */
    public void addRecord(final SAMRecord rec) {
        final byte[] quals = (useOriginalQualities ? rec.getOriginalBaseQualities() : rec.getBaseQualities());
        if (quals == null) return;
        recordsCount++;
        final int length = quals.length;
        final boolean rc = rec.getReadNegativeStrandFlag();
        // Cycles are 1-based (and run backwards for reverse-strand reads), so index `length`
        // must be addressable.
        ensureArraysBigEnough(length + 1);

        for (int i = 0; i < length; ++i) {
            final int cycle = rc ? length - i : i + 1;

            if (rec.getReadPairedFlag() && rec.getSecondOfPairFlag()) {
                secondReadTotalsByCycle[cycle] += quals[i];
                secondReadTotalProbsByCycle[cycle] += QualityUtil.getErrorProbabilityFromPhredScore(quals[i]);
                secondReadCountsByCycle[cycle] += 1;
            }
            else {
                firstReadTotalsByCycle[cycle] += quals[i];
                firstReadTotalProbsByCycle[cycle] += QualityUtil.getErrorProbabilityFromPhredScore(quals[i]);
                firstReadCountsByCycle[cycle] += 1;
            }
        }
    }

    /**
     * Merges the accumulators of another generator into this one.
     *
     * @param other the generator to merge from; {@code null} is a no-op
     */
    public void addOtherHistogramGenerator(final HistogramGenerator other){
        if (other != null){
            ensureArraysBigEnough(other.maxLengthSoFar);
            // Fix: iterate only over other's cycles. Iterating up to this.maxLengthSoFar threw
            // ArrayIndexOutOfBoundsException whenever this generator had already seen longer
            // reads than `other` (other's arrays are shorter than the loop bound).
            for (int i = 0; i < other.maxLengthSoFar; i++){
                firstReadCountsByCycle[i] += other.firstReadCountsByCycle[i];
                secondReadCountsByCycle[i] += other.secondReadCountsByCycle[i];
                firstReadTotalsByCycle[i] += other.firstReadTotalsByCycle[i];
                secondReadTotalsByCycle[i] += other.secondReadTotalsByCycle[i];
                firstReadTotalProbsByCycle[i] += other.firstReadTotalProbsByCycle[i];
                secondReadTotalProbsByCycle[i] += other.secondReadTotalProbsByCycle[i];
            }
            recordsCount += other.recordsCount;
        }
    }

    /**
     * Calculates the length of the longest stretch of cycles whose (smoothed) mean error
     * probability stays at or below the probability implied by {@code threshold}.
     * The first {@link #skipBases} cycles are skipped, and the scan stops at the first
     * cycle with fewer than {@link #minimalCount} observations.
     *
     * @param threshold            phred-scaled quality threshold
     * @param readInPair           1 to analyze first-of-pair accumulators, anything else for second
     * @param spanningWindowLength half-width of the smoothing window applied to the error means
     * @return length of the longest qualifying run of cycles
     */
    public int calculateLQ(final int threshold, int readInPair, int spanningWindowLength){
        final double errorProbThreshold = QualityUtil.getErrorProbabilityFromPhredScore(threshold);
        List<Double> result = new ArrayList<>();
        List<Long> weights = new ArrayList<>();
        double[] accumulator;
        long[] counts;
        if (readInPair == 1){
            accumulator = firstReadTotalProbsByCycle;
            counts = firstReadCountsByCycle;
        } else {
            accumulator = secondReadTotalProbsByCycle;
            counts = secondReadCountsByCycle;
        }
        for (int i = skipBases; i < accumulator.length; i++ ){
            // Stop at the first under-covered cycle: later cycles are even sparser.
            if (counts[i] < minimalCount){
                break;
            }
            result.add(accumulator[i] / counts[i]); // mean error probability at cycle i
            weights.add(counts[i]);
        }
        applySpanningWindowMean(result, weights, spanningWindowLength);
        return longestHighQuality(result, errorProbThreshold);
    }

    /**
     * Smooths {@code vector} in place with a weighted mean over a window of
     * {@code 2 * spanLength + 1} entries (clipped at the ends), weighting by {@code weights}.
     */
    private void applySpanningWindowMean(List<Double> vector, List<Long> weights, final int spanLength){
        List<Double> tmp = new ArrayList<>(vector); // snapshot: read originals while writing in place
        for (int i = 0; i < vector.size(); i++){
            double tmpEr = 0;
            long tmpWeight = 0;
            for (int j = Math.max(i - spanLength, 0); j < Math.min(i + spanLength + 1, vector.size()); j++){
                tmpEr += tmp.get(j) * weights.get(j);
                tmpWeight += weights.get(j);
            }
            vector.set(i, tmpEr / tmpWeight);
        }
    }

    /**
     * Returns the length of the longest contiguous run of values that are
     * {@code <= errorProbThreshold}.
     */
    private int longestHighQuality(List<Double> averageErrorProbabilities, double errorProbThreshold){
        int curStart = 0;
        int curEnd = 0;
        int curBestIntervalLength = 0;

        while ( curEnd < averageErrorProbabilities.size() ) {
            if (averageErrorProbabilities.get(curEnd) <= errorProbThreshold) {
                curEnd++;
            } else {
                if ((curEnd - curStart) > curBestIntervalLength) {
                    curBestIntervalLength = curEnd - curStart;
                }
                curStart = curEnd + 1;
                curEnd = curStart;
            }
        }
        // Account for a qualifying run that extends to the end of the list.
        if ((curEnd - curStart) > curBestIntervalLength) {
            curBestIntervalLength = curEnd - curStart;
        }
        return curBestIntervalLength;
    }

    /** Returns true if no read has been accumulated yet. */
    boolean isEmpty() {
        return maxLengthSoFar == 0;
    }


    /** Grows all per-cycle arrays (zero-padded) so they can be indexed up to {@code length - 1}. */
    private void ensureArraysBigEnough(final int length) {
        if (length > maxLengthSoFar) {
            firstReadTotalsByCycle = Arrays.copyOf(firstReadTotalsByCycle, length);
            firstReadTotalProbsByCycle = Arrays.copyOf(firstReadTotalProbsByCycle, length);
            firstReadCountsByCycle = Arrays.copyOf(firstReadCountsByCycle, length);
            secondReadTotalsByCycle = Arrays.copyOf(secondReadTotalsByCycle, length);
            secondReadTotalProbsByCycle = Arrays.copyOf(secondReadTotalProbsByCycle, length);
            secondReadCountsByCycle = Arrays.copyOf(secondReadCountsByCycle, length);
            maxLengthSoFar = length;
        }
    }

    /**
     * Builds a histogram of mean phred quality by cycle, with second-of-pair cycles
     * appended after the last populated first-of-pair cycle.
     */
    Histogram<Integer> getMeanQualityHistogram() {
        final String label = useOriginalQualities ? "MEAN_ORIGINAL_QUALITY" : "MEAN_QUALITY";
        final Histogram<Integer> meanQualities = new Histogram<Integer>("CYCLE", label);

        int firstReadLength = 0;

        // NOTE(review): this skips cycles whose quality sum is 0 even if they were observed
        // (counts > 0); the second-read loop below tests counts instead — confirm intended.
        for (int cycle = 0; cycle < firstReadTotalsByCycle.length; ++cycle) {
            if (firstReadTotalsByCycle[cycle] > 0) {
                meanQualities.increment(cycle, firstReadTotalsByCycle[cycle] / firstReadCountsByCycle[cycle]);
                firstReadLength = cycle;
            }
        }

        for (int i = 0; i < secondReadTotalsByCycle.length; ++i) {
            if (secondReadCountsByCycle[i] > 0) {
                final int cycle = firstReadLength + i; // continue numbering after read 1
                meanQualities.increment(cycle, secondReadTotalsByCycle[i] / secondReadCountsByCycle[i]);
            }
        }

        return meanQualities;
    }

    /**
     * Builds a histogram of mean error probability by cycle, with second-of-pair cycles
     * appended after the last populated first-of-pair cycle.
     */
    Histogram<Integer> getMeanErrorProbHistogram() {
        final String label = useOriginalQualities ? "MEAN_ORIGINAL_ERROR_PROB" : "MEAN_ERROR_PROB";
        final Histogram<Integer> meanQualities = new Histogram<Integer>("CYCLE", label);

        int firstReadLength = 0;

        for (int cycle = 0; cycle < firstReadTotalsByCycle.length; ++cycle) {
            if (firstReadTotalsByCycle[cycle] > 0) {
                meanQualities.increment(cycle, firstReadTotalProbsByCycle[cycle] / firstReadCountsByCycle[cycle]);
                firstReadLength = cycle;
            }
        }

        for (int i = 0; i < secondReadTotalsByCycle.length; ++i) {
            if (secondReadCountsByCycle[i] > 0) {
                final int cycle = firstReadLength + i; // continue numbering after read 1
                meanQualities.increment(cycle, secondReadTotalProbsByCycle[i] / secondReadCountsByCycle[i]);
            }
        }

        return meanQualities;
    }

}
19 changes: 19 additions & 0 deletions src/main/java/picard/sam/DuplicationMetricsFactory.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
package picard.sam;

/**
 * Creates {@code DuplicationMetrics} instances, selecting the flow-based
 * specialization when flow metrics are requested.
 */
public class DuplicationMetricsFactory {

    /**
     * Creates the metrics object appropriate for the requested mode.
     *
     * @param flowMetrics whether flow-based duplication metrics should be collected
     * @return a {@code FlowBasedDuplicationMetrics} when {@code flowMetrics} is true,
     *         otherwise a plain {@code DuplicationMetrics}
     */
    public static DuplicationMetrics createMetrics(final boolean flowMetrics) {
        return flowMetrics ? new FlowBasedDuplicationMetrics() : new DuplicationMetrics();
    }

    /** Creates standard (non flow-based) duplication metrics. */
    public static DuplicationMetrics createMetrics() {
        return new DuplicationMetrics();
    }
}
74 changes: 74 additions & 0 deletions src/main/java/picard/sam/FlowBasedDuplicationMetrics.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,74 @@
package picard.sam;

import htsjdk.samtools.SAMRecord;
import picard.sam.markduplicates.util.AbstractMarkDuplicatesCommandLineProgram;
import picard.util.MathUtil;

/**
 * Duplication metrics extended with flow-based counters that split unpaired reads by
 * whether their exact fragment length (TLEN) is known.
 */
public class FlowBasedDuplicationMetrics extends DuplicationMetrics {

    /*
     * count of single end reads where the exact fragment length is known (i.e. clipped)
     */
    @MergeByAdding
    public long UNPAIRED_WITH_TLEN;

    /*
     * count of single end duplicates where the exact fragment length is
     * unknown (quality trimmed, not clipped)
     */
    @MergeByAdding
    public long UNPAIRED_DUPS_WITHOUT_TLEN;

    /*
     * count of duplicates where both ends are known
     */
    @MergeByAdding
    public long UNPAIRED_DUPS_WITH_TLEN;

    /**
     * The fraction of duplicated reads out of all reads with exact
     * fragment length unknown
     */
    @NoMergingIsDerived
    public Double UNPAIRED_DUP_RATE_WITHOUT_TLEN;

    /**
     * The fraction of duplicated reads out of all reads with exact fragment
     * length known
     */
    @NoMergingIsDerived
    public Double UNPAIRED_DUP_RATE_WITH_TLEN;


    /** Computes the derived duplication rates from the merged counters. */
    @Override
    public void calculateDerivedFields() {
        super.calculateDerivedFields();

        // Reads without a known TLEN are all unpaired reads minus those with a known TLEN.
        UNPAIRED_DUP_RATE_WITHOUT_TLEN = MathUtil.divide(UNPAIRED_DUPS_WITHOUT_TLEN, UNPAIRED_READS_EXAMINED - UNPAIRED_WITH_TLEN);
        UNPAIRED_DUP_RATE_WITH_TLEN = MathUtil.divide(UNPAIRED_DUPS_WITH_TLEN, UNPAIRED_WITH_TLEN);
    }

    /**
     * Updates the flow-specific duplicate counters for a duplicate read, in addition to the
     * base-class accounting.
     */
    @Override
    public void addDuplicateReadToMetrics(final SAMRecord rec) {
        super.addDuplicateReadToMetrics(rec);

        // Only primary, mapped records are counted.
        if (rec.isSecondaryOrSupplementary() || rec.getReadUnmappedFlag()) {
            return;
        }
        // Only effectively-unpaired reads (unpaired, or mate unmapped) are counted.
        if (rec.getReadPairedFlag() && !rec.getMateUnmappedFlag()) {
            return;
        }
        if (AbstractMarkDuplicatesCommandLineProgram.isSingleEndReadKnownFragment(rec)) {
            ++UNPAIRED_DUPS_WITH_TLEN;
        } else {
            ++UNPAIRED_DUPS_WITHOUT_TLEN;
        }
    }

    /**
     * Updates the known-fragment-length counter for a library read, in addition to the
     * base-class accounting.
     */
    @Override
    public void addReadToLibraryMetrics(final SAMRecord rec) {

        super.addReadToLibraryMetrics(rec);

        if (AbstractMarkDuplicatesCommandLineProgram.isSingleEndReadKnownFragment(rec)) {
            ++UNPAIRED_WITH_TLEN;
        }
    }

}
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
package picard.sam.markduplicates;

import org.broadinstitute.barclay.argparser.Argument;

/**
 * Command-line arguments controlling flow-based duplicate-marking behavior.
 * All arguments are optional and default to the standard (non-flow) behavior.
 */
public class MarkDuplicatesForFlowArgumentCollection {

    @Argument(doc = "enable parameters and behavior specific to flow based reads.", optional = true)
    public boolean FLOW_MODE = false;

    @Argument(doc = "Use specific quality summing strategy for flow based reads. The strategy ensures that the same " +
            "(and correct) quality value is used for all bases of the same homopolymer.", optional = true)
    public boolean FLOW_QUALITY_SUM_STRATEGY = false;

    // Fix: the original help text concatenated "...to start and" + "end on..." without a
    // separating space, rendering as "start andend" in the printed usage.
    @Argument(doc = "Make the end location of single end read be significant when considering duplicates, " +
            "in addition to the start location, which is always significant (i.e. require single-ended reads to start and " +
            "end on the same position to be considered duplicate) " +
            "(for this argument, \"read end\" means 3' end).", optional = true)
    public boolean USE_END_IN_UNPAIRED_READS = false;

    @Argument(doc = "Use position of the clipping as the end position, when considering duplicates (or use the unclipped end position) " +
            "(for this argument, \"read end\" means 3' end).", optional = true)
    public boolean USE_UNPAIRED_CLIPPED_END = false;

    @Argument(doc = "Maximal difference of the read end position that counted as equal. Useful for flow based " +
            "reads where the end position might vary due to sequencing errors. " +
            "(for this argument, \"read end\" means 3' end)", optional = true)
    public int UNPAIRED_END_UNCERTAINTY = 0;

    @Argument(doc = "Skip first N flows, starting from the read's start, when considering duplicates. Useful for flow based reads where sometimes there " +
            "is noise in the first flows " +
            "(for this argument, \"read start\" means 5' end).", optional = true)
    public int FLOW_SKIP_FIRST_N_FLOWS = 0;

    @Argument(doc = "Treat position of read trimming based on quality as the known end (relevant for flow based reads). Default false - if the read " +
            "is trimmed on quality its end is not defined and the read is duplicate of any read starting at the same place.", optional = true)
    public boolean FLOW_Q_IS_KNOWN_END = false;

    @Argument(doc = "Threshold for considering a quality value high enough to be included when calculating FLOW_QUALITY_SUM_STRATEGY calculation.", optional = true)
    public int FLOW_EFFECTIVE_QUALITY_THRESHOLD = 15;

}
Loading

0 comments on commit bbbc6ed

Please sign in to comment.