Skip to content

Commit

Permalink
Merge pull request #84 from Illumina/perm-improv
Browse files Browse the repository at this point in the history
Performance improvement by opening bam/cram file only once
  • Loading branch information
yjqiu authored Nov 15, 2019
2 parents 40b4a77 + fd21ada commit 380e302
Show file tree
Hide file tree
Showing 7 changed files with 97 additions and 10 deletions.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -4,3 +4,4 @@ cmake-build-debug
test-example
build
.vscode
build.sh
20 changes: 10 additions & 10 deletions sample_analysis/HtsSeekingSampleAnalysis.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -64,10 +64,8 @@ static bool checkIfMatesWereMappedNearby(const MappedRead& read)
return (read.contigIndex() == read.mateContigIndex()) && (std::abs(read.pos() - read.matePos()) < kMaxDistance);
}

static void recoverMates(const string& htsFilePath, const string& htsReferencePath, ReadPairs& readPairs)
static void recoverMates(htshelpers::MateExtractor& mateExtractor, ReadPairs& readPairs)
{
htshelpers::MateExtractor mateExtractor(htsFilePath, htsReferencePath);

for (auto& fragmentIdAndReadPair : readPairs)
{
ReadPair& readPair = fragmentIdAndReadPair.second;
Expand Down Expand Up @@ -116,10 +114,9 @@ static int getReadCountCap(const vector<GenomicRegion>& regionsWithReads)
return readCountCap;
}

static ReadPairs
collectReads(const vector<GenomicRegion>& regionsWithReads, const string& htsFilePath, const string& htsReferencePath)
static ReadPairs collectReads(
const vector<GenomicRegion>& regionsWithReads, HtsFileSeeker& htsFileSeeker, htshelpers::MateExtractor& mateExtractor)
{
HtsFileSeeker htsFileSeeker(htsFilePath, htsReferencePath);
ReadPairs readPairs;

for (const auto& regionWithReads : regionsWithReads)
Expand Down Expand Up @@ -150,7 +147,7 @@ collectReads(const vector<GenomicRegion>& regionsWithReads, const string& htsFil
}

const int numReadsBeforeRecovery = readPairs.NumReads();
recoverMates(htsFilePath, htsReferencePath, readPairs);
recoverMates(mateExtractor, readPairs);
const int numReadsAfterRecovery = readPairs.NumReads() - numReadsBeforeRecovery;
spdlog::debug("Recovered {} reads", numReadsAfterRecovery);

Expand All @@ -161,11 +158,14 @@ SampleFindings htsSeekingSampleAnalysis(
const InputPaths& inputPaths, Sex sampleSex, const LocusCatalog& regionCatalog,
const vector<RegionInfo>& normRegionInfo, BamletWriterPtr bamletWriter)
{
CatalogAnalyzer normRegionAnalyzer({ {} }, normRegionInfo, bamletWriter);
HtsFileSeeker htsFileSeeker(inputPaths.htsFile(), inputPaths.reference());
htshelpers::MateExtractor mateExtractor(inputPaths.htsFile(), inputPaths.reference());

CatalogAnalyzer normRegionAnalyzer({ {} }, normRegionInfo, bamletWriter);
std::vector<RegionDepthInfo> normDepthInfo;
for (RegionInfo regionInfo : normRegionInfo)
{
ReadPairs readPairs = collectReads({ regionInfo.region }, inputPaths.htsFile(), inputPaths.reference());
ReadPairs readPairs = collectReads({ regionInfo.region }, htsFileSeeker, mateExtractor);

for (const auto& fragmentIdAndReadPair : readPairs)
{
Expand Down Expand Up @@ -194,7 +194,7 @@ SampleFindings htsSeekingSampleAnalysis(
const auto& locusSpec = locusIdAndRegionSpec.second;

ReadPairs readPairs;
readPairs = collectReads(locusSpec->regionsWithReads(), inputPaths.htsFile(), inputPaths.reference());
readPairs = collectReads(locusSpec->regionsWithReads(), htsFileSeeker, mateExtractor);

CatalogAnalyzer catalogAnalyzer({ { locusId, locusSpec } }, std::vector<RegionInfo> {}, bamletWriter);

Expand Down
2 changes: 2 additions & 0 deletions sample_analysis/MateExtractor.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -117,6 +117,8 @@ namespace htshelpers

while (sam_itr_next(htsFilePtr_, htsRegionPtr_, htsAlignmentPtr_) >= 0)
{
if (htsAlignmentPtr_->core.pos != searchRegionStart) continue;

MappedRead putativeMate = htshelpers::decodeRead(htsAlignmentPtr_);

const bool belongToSameFragment = read.fragmentId() == putativeMate.fragmentId();
Expand Down
18 changes: 18 additions & 0 deletions variant_catalog/grch37/variant_catalog.json
Original file line number Diff line number Diff line change
Expand Up @@ -264,5 +264,23 @@
"LocusId": "TCF4",
"LocusStructure": "(CAG)*",
"ReferenceRegion": "18:53253386-53253458"
},
{
"VariantType": "Repeat",
"LocusId": "NIPA1",
"LocusStructure": "(CGC)*",
"ReferenceRegion": "15:23086366-23086390"
},
{
"LocusId": "GLS",
"LocusStructure": "(GGA)*",
"ReferenceRegion": "2:191745599-191745646 ",
"VariantType": "Repeat"
},
{
"LocusId": "RFC1",
"LocusStructure": "(AARRG)*",
"ReferenceRegion": "4:39350044-39350099",
"VariantType": "Repeat"
}
]
24 changes: 24 additions & 0 deletions variant_catalog/grch38/variant_catalog.json
Original file line number Diff line number Diff line change
Expand Up @@ -504,5 +504,29 @@
"LocusStructure": "(CAG)*",
"ReferenceRegion": "18:55586155-55586227",
"VariantType": "Repeat"
},
{
"VariantType": "Repeat",
"LocusId": "NIPA1",
"LocusStructure": "(GCG)*",
"ReferenceRegion": "15:22786677-22786701"
},
{
"LocusId": "NOTCH2NL",
"LocusStructure": "(GGC)*",
"ReferenceRegion": "1:149390802-149390841",
"VariantType": "Repeat"
},
{
"LocusId": "GLS",
"LocusStructure": "(GGA)*",
"ReferenceRegion": "2:190880873-190880920",
"VariantType": "Repeat"
},
{
"LocusId": "RFC1",
"LocusStructure": "(AARRG)*",
"ReferenceRegion": "4:39348424-39348479",
"VariantType": "Repeat"
}
]
18 changes: 18 additions & 0 deletions variant_catalog/hg19/variant_catalog.json
Original file line number Diff line number Diff line change
Expand Up @@ -264,5 +264,23 @@
"LocusId": "TCF4",
"LocusStructure": "(CAG)*",
"ReferenceRegion": "chr18:53253386-53253458"
},
{
"VariantType": "Repeat",
"LocusId": "NIPA1",
"LocusStructure": "(CGC)*",
"ReferenceRegion": "chr15:23086366-23086390"
},
{
"LocusId": "GLS",
"LocusStructure": "(GGA)*",
"ReferenceRegion": "chr2:191745599-191745646 ",
"VariantType": "Repeat"
},
{
"LocusId": "RFC1",
"LocusStructure": "(AARRG)*",
"ReferenceRegion": "chr4:39350044-39350099",
"VariantType": "Repeat"
}
]
24 changes: 24 additions & 0 deletions variant_catalog/hg38/variant_catalog.json
Original file line number Diff line number Diff line change
Expand Up @@ -504,5 +504,29 @@
"LocusStructure": "(CAG)*",
"ReferenceRegion": "chr18:55586155-55586227",
"VariantType": "Repeat"
},
{
"VariantType": "Repeat",
"LocusId": "NIPA1",
"LocusStructure": "(GCG)*",
"ReferenceRegion": "chr15:22786677-22786701"
},
{
"LocusId": "NOTCH2NL",
"LocusStructure": "(GGC)*",
"ReferenceRegion": "chr1:149390802-149390841",
"VariantType": "Repeat"
},
{
"LocusId": "GLS",
"LocusStructure": "(GGA)*",
"ReferenceRegion": "chr2:190880873-190880920",
"VariantType": "Repeat"
},
{
"LocusId": "RFC1",
"LocusStructure": "(AARRG)*",
"ReferenceRegion": "chr4:39348424-39348479",
"VariantType": "Repeat"
}
]

0 comments on commit 380e302

Please # to comment.