-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathselect_full_reads.py
42 lines (31 loc) · 1.48 KB
/
select_full_reads.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
from __future__ import print_function
import pysam
import argparse
import csv
"""
A function to select reads for which there is a >= X length primary mapping
More specifically, the script measures the length of the reference section that is aligned to and compares this to the
cut-off
"""
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument('bam', type=str, help="path to bamfile")
parser.add_argument('out', type=str, help="path to output file")
args = parser.parse_args()
# Cut-off for what counts as 'full length'
hbv_length = 3200
revcompl = lambda x: ''.join([{'A': 'T', 'C': 'G', 'G': 'C', 'T': 'A'}[B] for B in x][::-1])
with pysam.AlignmentFile(args.bam, 'rb') as bamfile:
with open(args.out, 'w') as outfile:
# For each primary alignment, use pysam to get aligned length of the read on the reference genome
# Print details of full length reads to file
bam_iter = bamfile.fetch()
for read in bam_iter:
if not read.is_secondary: # this should also have been checked in the previous step of the pipeline
length = read.reference_length
if length >= hbv_length:
if read.is_reverse:
revcompl_flag=True
else:
revcompl_flag=False
print(read.query_name, "\t", length, "\t", revcompl_flag, file=outfile)