-
Notifications
You must be signed in to change notification settings - Fork 2
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #280 from maize-genetics/read-mapping-target-counts
Read mapping target counts QC
- Loading branch information
Showing
4 changed files
with
418 additions
and
1 deletion.
There are no files selected for viewing
41 changes: 41 additions & 0 deletions
41
data/test/kmerReadMapping/LineA_1_readMapping_LineA_counts.txt
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,41 @@ | ||
refRange LineA_HapID LineA_HapCount HighestAltCount Difference OtherHapCounts | ||
1:1-1000 12f0cec9102e84a161866e37072443b7 853 0 853 0_4fc7b8af32ddd74e07cb49d147ef1938, 0_546d1839623a5b0ea98bbff9a8a320e2 | ||
1:1001-5500 3149b3144f93134eb29661bade697fc6 4378 70 4308 70_8967fabf10e55d881caa6fe192e7d4ca, 0_57705b1e2541c7634ea59a48fc52026f | ||
1:5501-6500 1b568197f6f329ec5b71f66e49a732fb 855 47 808 47_05efe15d97db33185b64821791b01b0f, 0_d896e9cc56e74f39fd3f3c665191d727 | ||
1:6501-11000 369464a8743d2e40ad83d1375c196bdd 4355 101 4254 101_8f7de1a693aa15fb8fb7b85e7a8b5e95, 24_66465399052d8ebe48b06329c60fee03 | ||
1:11001-12000 f50fe6d6b3d9a9d305889db977969916 855 0 855 0_6b5f46bd5c31917af3ab6c3ccc8668cd, 0_2b4590f722ef9229c15d29e0b4e51a0e | ||
1:12001-16500 d4c8b5505d7046b41d7f69b246063ebb 4283 80 4203 80_aff71f19de448514a6d9208b1fcb4e8a, 32_f2a26406aaa6aefdc20ccdf8cb313dff | ||
1:16501-17500 18498579d89483ac270e8cca57f34752 855 38 817 38_fca8c375152e127d322a99bbd10e9873, 0_e07d04d2fc96b0f8f075aa5641785f78 | ||
1:17501-22000 8e5dbf7df5b265ba5daf35c4dfdf8a39 4302 58 4244 58_bcb3fa7872cb54537cdf34c96b01ed15, 6_90919d6d6f101925c82108fb86ec0621 | ||
1:22001-23000 cfa288e041fc4dbe299d20ae1920d258 855 27 828 27_a92a239d3a5652d3b9daba7de6febb97, 0_dbf891359a4c3f8da75fbc602bcbd631 | ||
1:23001-27500 f3bb0c19ab2a88d82cf79f9311ffa61f 4329 112 4217 112_164098aa7cfeff45d651fd5554538bbe, 0_39c0c13208f7990489337408cf980b79 | ||
1:27501-28500 64747625066ac7d3477cca8ecfa46ed1 855 67 788 67_105c85412229b45439db1f03c3f064f4, 3_ea27ac2def0e1d80c59cdb38b6dfa66a | ||
1:28501-33000 f2e5593769729ebb8aab356ba78a7e3a 4324 49 4275 49_674dda50f4e91de9d4a88cc341173974, 0_00f297caa4a0fa5a6f8e76d388393fa7 | ||
1:33001-34000 5a2ff3fb844d5647987da5c194d1c728 855 0 855 0_79146831745c85d26117f13b5873935f, 0_7895054835ae364b35e7c4a805610232 | ||
1:34001-38500 bb4b6391a180e13f4e2448674cba6d43 4394 81 4313 81_9650638ee16a4853495610120e1323f8, 29_25413a93cd7622fa44d015d6914f344d | ||
1:38501-39500 c255c2971f8d977c0407d6c7da877610 855 0 855 0_5d5381ccc68ac93826334f634578672e, 0_a7f279c629f06b3d666f07debaf2bbad | ||
1:39501-44000 c4d18d9fa581d3a791a3f2cfd40aefd6 4308 0 4308 0_18dcf42d7b02fd603e05e3714fa71e95, 0_073286a82fe47d6a370e8a7a3803f1d3 | ||
1:44001-45000 bfb9ef2dde7090760392e8b2701e13b4 855 93 762 93_ddbae40728836c4c245f9ccb7e651515, 0_be9ca4fe6c9bc665702ab98c8e7593fe | ||
1:45001-49500 5ae13475e95253ee352dda4875dc9f7c 4355 27 4328 27_06ae4e937668d301e325d43725a38c3f, 0_7e388530fa03aa2ab21b3cd10a438f5a | ||
1:49501-50500 5be0e52ccfe573a6e42e4dd1e8658105 655 59 596 59_fafafee7c250c76b7e0571fde286022e, 33_a7214fe07512b511ddf13edade461b39 | ||
1:50501-55000 0 122 -122 122_f62948ca6d4385c880321727ba494bec | ||
2:1-1000 13417ecbb38b9a159e3ca8c9dade7088 853 0 853 0_180417a01edbfed525d7c238910e0ff4, 0_5812acb1aff74866003656316c4539a6 | ||
2:1001-5500 c16ac825052c0456069f6408652aadf8 4315 143 4172 143_799371fe289fbe0046797729659cb5ff, 0_8bcf66e8c49da2d9ad8cbafa0bb7a93d | ||
2:5501-6500 50044914d5111c5b5ec58c9d720e3b2d 855 0 855 0_45b121547c7ae517a181fdd2621495c4, 0_bbc9274ac9c98fd0f9399ec8f121b9d0 | ||
2:6501-11000 c472bb8d63e19218a4089821ae666db3 4335 91 4244 91_b8843efbd6adaa261a01518dc2a39aa2, 0_bc94073196b0b2c13e62b5fa47c76b51 | ||
2:11001-12000 3ec680649615da0685b8c245e0f196e2 855 0 855 0_b787382b1337fd694dd8d77de0141da4, 0_347f0478b1a553ef107243cb60a9ba7d | ||
2:12001-16500 184a72815a2ba5949635cc38769cedd0 4310 134 4176 134_6fb2de47c835bd9ab026c02d62f49807, 91_cb86faf105b192fd9281b7f8e00f4fae | ||
2:16501-17500 1e38bd82670c3f10982f70390c599a8d 855 24 831 24_db22dfc14799b1aa666eb7d571cf04ec, 0_d36a25dd15269876f5738efac5b6d34f | ||
2:17501-22000 dc3aea9f418eccd5ea6326c6d0d47e9c 4310 157 4153 157_f9f80587acc7c2ac0e973ed343f69cfe, 52_5be09ea29709972af27bf90073843f8d | ||
2:22001-23000 cf0acc0bc840779c02b16245aa59d35c 855 0 855 0_a8660522bc6d09bbb3ae7a60ea3ed64d, 0_43687e13112bbe841f811b0a9de82a94 | ||
2:23001-27500 74ffef78af08b12ba1c7876206570e26 4344 236 4108 236_261f7e63beed04e2500a9d8ae867e4ab, 31_9c467defbbea2e4be338eecd26c87e4c | ||
2:27501-28500 d915f009b3e02030ab68baf1c43c55ad 855 0 855 0_d65e22539028b240fab289fa1a070f3f, 0_9a863526c624a34d51c74f511756cfa4 | ||
2:28501-33000 c052e7e5a036bddc9f364324b203f1b3 4304 52 4252 52_8f2731708b30e1c402c6d6a69a983fe4, 14_e4f5f60483b68badd6caf4cc2812d2bc | ||
2:33001-34000 90248144d7173c1f1481008c43e65129 855 0 855 0_a5ae89082f94ae1a03a2dbcea1551e51, 0_6737c1cfff3c688e6c393fe405b51f6e | ||
2:34001-38500 ac9583b43abdb40e8da76e4dedccd534 4296 82 4214 82_a593b1c67f90cd58523762c11773dce3, 80_2c4b8564bbbdf70c6560fdefdbe3ef6a | ||
2:38501-39500 539d334104090326aa72b38e4b353cc0 855 0 855 0_4df3ac835af83a9b5bf80d4e4850627d, 0_b9f31fca37c7bd6cd05526a1ba567c94 | ||
2:39501-44000 51a185c08bf7fac70a60abfeaefc2c90 4207 158 4049 158_d8dd1f9c56425f5874687a1ec67a31e8, 0_c5c75ee44db641a9a1355b4a4cc7f4f1 | ||
2:44001-45000 d706ac56d212d11e1c12ec391c4ea01d 855 0 855 0_12ea83315f749ee2f2b38b631d8115f0, 0_ae5fd1aaa09c70d56e357fe319373758 | ||
2:45001-49500 565c967a46f12cd0bf458fe366cdc6db 4355 39 4316 39_c3415e16a664399ecec286ad7ae48e07, 17_c524a7c04519de87cb28b875146ac9ac | ||
2:49501-50500 0eb9029f3896313aebc69c8489923141 655 0 655 0_5031218d4ac709dd51a946acd0550356, 0_39f96726321b329964435865b3694fd2 | ||
2:50501-55000 0 0 0 0_105e63346a01d88e8339eddf9131c435 |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
125 changes: 125 additions & 0 deletions
125
src/main/kotlin/net/maizegenetics/phgv2/pathing/ReadMappingCountQc.kt
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,125 @@ | ||
package net.maizegenetics.phgv2.pathing | ||
|
||
import biokotlin.util.bufferedWriter | ||
import com.github.ajalt.clikt.core.CliktCommand | ||
import com.github.ajalt.clikt.parameters.options.option | ||
import com.github.ajalt.clikt.parameters.options.required | ||
import net.maizegenetics.phgv2.api.HaplotypeGraph | ||
import net.maizegenetics.phgv2.api.ReferenceRange | ||
import net.maizegenetics.phgv2.api.SampleGamete | ||
import net.maizegenetics.phgv2.cli.logCommand | ||
import org.apache.logging.log4j.LogManager | ||
import java.io.File | ||
|
||
class ReadMappingCountQc : CliktCommand("Read mapping count QC") { | ||
val myLogger = LogManager.getLogger(ReadMappingCountQc::class.java) | ||
|
||
val hvcfDir by option(help = "Directory with Haplotype VCF files") | ||
.required() | ||
|
||
val readMappingFile by option(help = "Read mapping file") | ||
.required() | ||
|
||
val targetSampleName by option(help = "Target sample name") | ||
.required() | ||
|
||
val outputDir by option(help = "Output directory") | ||
.required() | ||
|
||
override fun run() { | ||
logCommand(this) | ||
myLogger.info("Read mapping count QC") | ||
processReadMappingCounts(hvcfDir, readMappingFile, targetSampleName, outputDir) | ||
} | ||
fun processReadMappingCounts(hvcfDir: String, readMappingFile: String, targetSampleName: String, outputDir: String) { | ||
myLogger.info("Processing read mapping counts") | ||
val graph = HaplotypeGraph(hvcfDir) | ||
val hapIdToSampleGamete = graph.hapIdsToSampleGametes() | ||
val rangeToHapIds = graph.refRangeToHapIdList() | ||
|
||
val outputFile = buildCountOutputFile(outputDir, readMappingFile ,targetSampleName) | ||
|
||
val readMapping = AlignmentUtils.importReadMapping(readMappingFile) | ||
|
||
val hapIdCounts = convertReadMappingToHapIdCounts(readMapping) | ||
|
||
writeOutCounts(hapIdCounts, outputFile, targetSampleName, rangeToHapIds, hapIdToSampleGamete) | ||
} | ||
|
||
|
||
/** | ||
* Function to convert the read mappings into hapId -> count map so we could compare the target hapId to the actual counts | ||
*/ | ||
fun convertReadMappingToHapIdCounts(readMappings: Map<List<String>,Int>) : Map<String,Int> { | ||
//convert the hapIds into counts | ||
return readMappings.flatMap { (hapIds,count) -> | ||
hapIds.map { hapId -> Pair(hapId, count) } | ||
} | ||
.groupBy({ it.first }, { it.second }) | ||
.mapValues { it.value.sum() } | ||
} | ||
|
||
/** | ||
* Function to create an output file name for exporting the counts | ||
*/ | ||
fun buildCountOutputFile(outputDir: String, readMappingFile: String, targetSampleName: String) : String { | ||
//Get just the file name from the Mapping file without the extension | ||
return "$outputDir/${File(readMappingFile).nameWithoutExtension}_${targetSampleName}_counts.txt" | ||
} | ||
|
||
fun writeOutCounts(hapIdCounts: Map<String,Int>, | ||
outputFile: String, | ||
targetSampleName: String, | ||
rangeToHapId: Map<ReferenceRange, List<String>>, | ||
hapIdToSampleGamete: Map<String, List<SampleGamete>>) { | ||
myLogger.info("Writing out counts") | ||
bufferedWriter(outputFile).use { writer -> | ||
writer.write("refRange\t${targetSampleName}_HapID\t${targetSampleName}_HapCount\tHighestAltCount\tDifference\tOtherHapCounts\n") | ||
rangeToHapId.keys.sorted().forEach { range -> | ||
val outputString = buildOutputStringForHapIdsInRefRange(rangeToHapId, range, hapIdToSampleGamete, hapIdCounts, targetSampleName) | ||
writer.write(outputString) | ||
} | ||
} | ||
} | ||
|
||
fun buildOutputStringForHapIdsInRefRange( | ||
rangeToHapId: Map<ReferenceRange, List<String>>, | ||
range: ReferenceRange, | ||
hapIdToSampleGamete: Map<String, List<SampleGamete>>, | ||
hapIdCounts: Map<String, Int>, | ||
targetSampleName: String | ||
) : String { | ||
val hapIds = rangeToHapId[range]!! | ||
val targetHapId = findTargetHapIdForSample(hapIds, hapIdToSampleGamete, targetSampleName) | ||
|
||
val nonTargetCounts = computeCountsForNonTargetHapids(hapIds, targetHapId, hapIdCounts) | ||
|
||
val highestAlt = if (nonTargetCounts.isEmpty()) 0 else nonTargetCounts.first().first | ||
|
||
val targetCount = hapIdCounts.getOrDefault(targetHapId, 0) | ||
return "$range\t$targetHapId\t${targetCount}\t${highestAlt}\t${targetCount - highestAlt}\t" + | ||
"${nonTargetCounts.joinToString(", ") { "${it.first}_${it.second}" }}\n" | ||
|
||
} | ||
|
||
fun computeCountsForNonTargetHapids( | ||
hapIds: List<String>, | ||
targetHapId: String, | ||
hapIdCounts: Map<String, Int> | ||
) :List<Pair<Int,String>> { | ||
return hapIds.filter { it != targetHapId } | ||
.toSet() | ||
.map { hapId -> Pair(hapId, hapIdCounts.getOrDefault(hapId, 0)) } | ||
.sortedByDescending { it.second } | ||
.map { Pair(it.second, it.first) } | ||
|
||
} | ||
|
||
fun findTargetHapIdForSample( | ||
hapIds: List<String>, | ||
hapIdToSampleGamete: Map<String, List<SampleGamete>>, | ||
targetSampleName: String | ||
) : String { | ||
return hapIds.find { hapId -> hapIdToSampleGamete[hapId]!!.map { sg -> sg.name }.contains(targetSampleName) }?:"" | ||
} | ||
} |
Oops, something went wrong.