Skip to content

Commit

Permalink
Merge pull request #280 from maize-genetics/read-mapping-target-counts
Browse files Browse the repository at this point in the history
Read mapping target counts QC
  • Loading branch information
zrm22 authored Feb 17, 2025
2 parents 935067d + 5024f4c commit 71d1ec8
Show file tree
Hide file tree
Showing 4 changed files with 418 additions and 1 deletion.
41 changes: 41 additions & 0 deletions data/test/kmerReadMapping/LineA_1_readMapping_LineA_counts.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
refRange LineA_HapID LineA_HapCount HighestAltCount Difference OtherHapCounts
1:1-1000 12f0cec9102e84a161866e37072443b7 853 0 853 0_4fc7b8af32ddd74e07cb49d147ef1938, 0_546d1839623a5b0ea98bbff9a8a320e2
1:1001-5500 3149b3144f93134eb29661bade697fc6 4378 70 4308 70_8967fabf10e55d881caa6fe192e7d4ca, 0_57705b1e2541c7634ea59a48fc52026f
1:5501-6500 1b568197f6f329ec5b71f66e49a732fb 855 47 808 47_05efe15d97db33185b64821791b01b0f, 0_d896e9cc56e74f39fd3f3c665191d727
1:6501-11000 369464a8743d2e40ad83d1375c196bdd 4355 101 4254 101_8f7de1a693aa15fb8fb7b85e7a8b5e95, 24_66465399052d8ebe48b06329c60fee03
1:11001-12000 f50fe6d6b3d9a9d305889db977969916 855 0 855 0_6b5f46bd5c31917af3ab6c3ccc8668cd, 0_2b4590f722ef9229c15d29e0b4e51a0e
1:12001-16500 d4c8b5505d7046b41d7f69b246063ebb 4283 80 4203 80_aff71f19de448514a6d9208b1fcb4e8a, 32_f2a26406aaa6aefdc20ccdf8cb313dff
1:16501-17500 18498579d89483ac270e8cca57f34752 855 38 817 38_fca8c375152e127d322a99bbd10e9873, 0_e07d04d2fc96b0f8f075aa5641785f78
1:17501-22000 8e5dbf7df5b265ba5daf35c4dfdf8a39 4302 58 4244 58_bcb3fa7872cb54537cdf34c96b01ed15, 6_90919d6d6f101925c82108fb86ec0621
1:22001-23000 cfa288e041fc4dbe299d20ae1920d258 855 27 828 27_a92a239d3a5652d3b9daba7de6febb97, 0_dbf891359a4c3f8da75fbc602bcbd631
1:23001-27500 f3bb0c19ab2a88d82cf79f9311ffa61f 4329 112 4217 112_164098aa7cfeff45d651fd5554538bbe, 0_39c0c13208f7990489337408cf980b79
1:27501-28500 64747625066ac7d3477cca8ecfa46ed1 855 67 788 67_105c85412229b45439db1f03c3f064f4, 3_ea27ac2def0e1d80c59cdb38b6dfa66a
1:28501-33000 f2e5593769729ebb8aab356ba78a7e3a 4324 49 4275 49_674dda50f4e91de9d4a88cc341173974, 0_00f297caa4a0fa5a6f8e76d388393fa7
1:33001-34000 5a2ff3fb844d5647987da5c194d1c728 855 0 855 0_79146831745c85d26117f13b5873935f, 0_7895054835ae364b35e7c4a805610232
1:34001-38500 bb4b6391a180e13f4e2448674cba6d43 4394 81 4313 81_9650638ee16a4853495610120e1323f8, 29_25413a93cd7622fa44d015d6914f344d
1:38501-39500 c255c2971f8d977c0407d6c7da877610 855 0 855 0_5d5381ccc68ac93826334f634578672e, 0_a7f279c629f06b3d666f07debaf2bbad
1:39501-44000 c4d18d9fa581d3a791a3f2cfd40aefd6 4308 0 4308 0_18dcf42d7b02fd603e05e3714fa71e95, 0_073286a82fe47d6a370e8a7a3803f1d3
1:44001-45000 bfb9ef2dde7090760392e8b2701e13b4 855 93 762 93_ddbae40728836c4c245f9ccb7e651515, 0_be9ca4fe6c9bc665702ab98c8e7593fe
1:45001-49500 5ae13475e95253ee352dda4875dc9f7c 4355 27 4328 27_06ae4e937668d301e325d43725a38c3f, 0_7e388530fa03aa2ab21b3cd10a438f5a
1:49501-50500 5be0e52ccfe573a6e42e4dd1e8658105 655 59 596 59_fafafee7c250c76b7e0571fde286022e, 33_a7214fe07512b511ddf13edade461b39
1:50501-55000 0 122 -122 122_f62948ca6d4385c880321727ba494bec
2:1-1000 13417ecbb38b9a159e3ca8c9dade7088 853 0 853 0_180417a01edbfed525d7c238910e0ff4, 0_5812acb1aff74866003656316c4539a6
2:1001-5500 c16ac825052c0456069f6408652aadf8 4315 143 4172 143_799371fe289fbe0046797729659cb5ff, 0_8bcf66e8c49da2d9ad8cbafa0bb7a93d
2:5501-6500 50044914d5111c5b5ec58c9d720e3b2d 855 0 855 0_45b121547c7ae517a181fdd2621495c4, 0_bbc9274ac9c98fd0f9399ec8f121b9d0
2:6501-11000 c472bb8d63e19218a4089821ae666db3 4335 91 4244 91_b8843efbd6adaa261a01518dc2a39aa2, 0_bc94073196b0b2c13e62b5fa47c76b51
2:11001-12000 3ec680649615da0685b8c245e0f196e2 855 0 855 0_b787382b1337fd694dd8d77de0141da4, 0_347f0478b1a553ef107243cb60a9ba7d
2:12001-16500 184a72815a2ba5949635cc38769cedd0 4310 134 4176 134_6fb2de47c835bd9ab026c02d62f49807, 91_cb86faf105b192fd9281b7f8e00f4fae
2:16501-17500 1e38bd82670c3f10982f70390c599a8d 855 24 831 24_db22dfc14799b1aa666eb7d571cf04ec, 0_d36a25dd15269876f5738efac5b6d34f
2:17501-22000 dc3aea9f418eccd5ea6326c6d0d47e9c 4310 157 4153 157_f9f80587acc7c2ac0e973ed343f69cfe, 52_5be09ea29709972af27bf90073843f8d
2:22001-23000 cf0acc0bc840779c02b16245aa59d35c 855 0 855 0_a8660522bc6d09bbb3ae7a60ea3ed64d, 0_43687e13112bbe841f811b0a9de82a94
2:23001-27500 74ffef78af08b12ba1c7876206570e26 4344 236 4108 236_261f7e63beed04e2500a9d8ae867e4ab, 31_9c467defbbea2e4be338eecd26c87e4c
2:27501-28500 d915f009b3e02030ab68baf1c43c55ad 855 0 855 0_d65e22539028b240fab289fa1a070f3f, 0_9a863526c624a34d51c74f511756cfa4
2:28501-33000 c052e7e5a036bddc9f364324b203f1b3 4304 52 4252 52_8f2731708b30e1c402c6d6a69a983fe4, 14_e4f5f60483b68badd6caf4cc2812d2bc
2:33001-34000 90248144d7173c1f1481008c43e65129 855 0 855 0_a5ae89082f94ae1a03a2dbcea1551e51, 0_6737c1cfff3c688e6c393fe405b51f6e
2:34001-38500 ac9583b43abdb40e8da76e4dedccd534 4296 82 4214 82_a593b1c67f90cd58523762c11773dce3, 80_2c4b8564bbbdf70c6560fdefdbe3ef6a
2:38501-39500 539d334104090326aa72b38e4b353cc0 855 0 855 0_4df3ac835af83a9b5bf80d4e4850627d, 0_b9f31fca37c7bd6cd05526a1ba567c94
2:39501-44000 51a185c08bf7fac70a60abfeaefc2c90 4207 158 4049 158_d8dd1f9c56425f5874687a1ec67a31e8, 0_c5c75ee44db641a9a1355b4a4cc7f4f1
2:44001-45000 d706ac56d212d11e1c12ec391c4ea01d 855 0 855 0_12ea83315f749ee2f2b38b631d8115f0, 0_ae5fd1aaa09c70d56e357fe319373758
2:45001-49500 565c967a46f12cd0bf458fe366cdc6db 4355 39 4316 39_c3415e16a664399ecec286ad7ae48e07, 17_c524a7c04519de87cb28b875146ac9ac
2:49501-50500 0eb9029f3896313aebc69c8489923141 655 0 655 0_5031218d4ac709dd51a946acd0550356, 0_39f96726321b329964435865b3694fd2
2:50501-55000 0 0 0 0_105e63346a01d88e8339eddf9131c435
2 changes: 1 addition & 1 deletion src/main/kotlin/net/maizegenetics/phgv2/cli/Phg.kt
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@ fun main(args: Array<String>) = Phg()
RopeBwtIndex(), MapReads(), // Imputation
ConvertRm2Ps4gFile(), ConvertRopebwt2Ps4gFile(), // PS4G File creations.
CreateFastaFromHvcf(), ListSamples(), MergeHvcfs(), MergeGVCFs(), CalcVcfMetrics(), StartServer, ExtractEdgeReads(), //Utilities
QcReadMapping(), PathsToGff(), // Utilities continued
QcReadMapping(), ReadMappingCountQc(), PathsToGff(), // Utilities continued
CompositeToHaplotypeCoords(), // resequencing pipeline
InitHvcfArray(), LoadHvcf(), QueryHvcfArrays() // hvcf loading
,ConvertRm2Ps4gFile()
Expand Down
125 changes: 125 additions & 0 deletions src/main/kotlin/net/maizegenetics/phgv2/pathing/ReadMappingCountQc.kt
Original file line number Diff line number Diff line change
@@ -0,0 +1,125 @@
package net.maizegenetics.phgv2.pathing

import biokotlin.util.bufferedWriter
import com.github.ajalt.clikt.core.CliktCommand
import com.github.ajalt.clikt.parameters.options.option
import com.github.ajalt.clikt.parameters.options.required
import net.maizegenetics.phgv2.api.HaplotypeGraph
import net.maizegenetics.phgv2.api.ReferenceRange
import net.maizegenetics.phgv2.api.SampleGamete
import net.maizegenetics.phgv2.cli.logCommand
import org.apache.logging.log4j.LogManager
import java.io.File

class ReadMappingCountQc : CliktCommand("Read mapping count QC") {
val myLogger = LogManager.getLogger(ReadMappingCountQc::class.java)

val hvcfDir by option(help = "Directory with Haplotype VCF files")
.required()

val readMappingFile by option(help = "Read mapping file")
.required()

val targetSampleName by option(help = "Target sample name")
.required()

val outputDir by option(help = "Output directory")
.required()

override fun run() {
logCommand(this)
myLogger.info("Read mapping count QC")
processReadMappingCounts(hvcfDir, readMappingFile, targetSampleName, outputDir)
}
fun processReadMappingCounts(hvcfDir: String, readMappingFile: String, targetSampleName: String, outputDir: String) {
myLogger.info("Processing read mapping counts")
val graph = HaplotypeGraph(hvcfDir)
val hapIdToSampleGamete = graph.hapIdsToSampleGametes()
val rangeToHapIds = graph.refRangeToHapIdList()

val outputFile = buildCountOutputFile(outputDir, readMappingFile ,targetSampleName)

val readMapping = AlignmentUtils.importReadMapping(readMappingFile)

val hapIdCounts = convertReadMappingToHapIdCounts(readMapping)

writeOutCounts(hapIdCounts, outputFile, targetSampleName, rangeToHapIds, hapIdToSampleGamete)
}


/**
* Function to convert the read mappings into hapId -> count map so we could compare the target hapId to the actual counts
*/
fun convertReadMappingToHapIdCounts(readMappings: Map<List<String>,Int>) : Map<String,Int> {
//convert the hapIds into counts
return readMappings.flatMap { (hapIds,count) ->
hapIds.map { hapId -> Pair(hapId, count) }
}
.groupBy({ it.first }, { it.second })
.mapValues { it.value.sum() }
}

/**
* Function to create an output file name for exporting the counts
*/
fun buildCountOutputFile(outputDir: String, readMappingFile: String, targetSampleName: String) : String {
//Get just the file name from the Mapping file without the extension
return "$outputDir/${File(readMappingFile).nameWithoutExtension}_${targetSampleName}_counts.txt"
}

fun writeOutCounts(hapIdCounts: Map<String,Int>,
outputFile: String,
targetSampleName: String,
rangeToHapId: Map<ReferenceRange, List<String>>,
hapIdToSampleGamete: Map<String, List<SampleGamete>>) {
myLogger.info("Writing out counts")
bufferedWriter(outputFile).use { writer ->
writer.write("refRange\t${targetSampleName}_HapID\t${targetSampleName}_HapCount\tHighestAltCount\tDifference\tOtherHapCounts\n")
rangeToHapId.keys.sorted().forEach { range ->
val outputString = buildOutputStringForHapIdsInRefRange(rangeToHapId, range, hapIdToSampleGamete, hapIdCounts, targetSampleName)
writer.write(outputString)
}
}
}

fun buildOutputStringForHapIdsInRefRange(
rangeToHapId: Map<ReferenceRange, List<String>>,
range: ReferenceRange,
hapIdToSampleGamete: Map<String, List<SampleGamete>>,
hapIdCounts: Map<String, Int>,
targetSampleName: String
) : String {
val hapIds = rangeToHapId[range]!!
val targetHapId = findTargetHapIdForSample(hapIds, hapIdToSampleGamete, targetSampleName)

val nonTargetCounts = computeCountsForNonTargetHapids(hapIds, targetHapId, hapIdCounts)

val highestAlt = if (nonTargetCounts.isEmpty()) 0 else nonTargetCounts.first().first

val targetCount = hapIdCounts.getOrDefault(targetHapId, 0)
return "$range\t$targetHapId\t${targetCount}\t${highestAlt}\t${targetCount - highestAlt}\t" +
"${nonTargetCounts.joinToString(", ") { "${it.first}_${it.second}" }}\n"

}

fun computeCountsForNonTargetHapids(
hapIds: List<String>,
targetHapId: String,
hapIdCounts: Map<String, Int>
) :List<Pair<Int,String>> {
return hapIds.filter { it != targetHapId }
.toSet()
.map { hapId -> Pair(hapId, hapIdCounts.getOrDefault(hapId, 0)) }
.sortedByDescending { it.second }
.map { Pair(it.second, it.first) }

}

fun findTargetHapIdForSample(
hapIds: List<String>,
hapIdToSampleGamete: Map<String, List<SampleGamete>>,
targetSampleName: String
) : String {
return hapIds.find { hapId -> hapIdToSampleGamete[hapId]!!.map { sg -> sg.name }.contains(targetSampleName) }?:""
}
}
Loading

0 comments on commit 71d1ec8

Please # to comment.