-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathpseudoautosomal_region.py
executable file
·84 lines (72 loc) · 4.79 KB
/
pseudoautosomal_region.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
#!/usr/bin/env python3
__author__ = 'tomarovsky'
from Biocrutch.Statistics.coverage_statistics.CoverageMetrics import CoveragesMetrics
from Biocrutch.Statistics.PAR.coordinator import Coordinator
from Biocrutch.Statistics.PAR.filter import Filter
from Biocrutch.Routines.routine_functions import metaopen
from collections import Counter
from sys import stdin
import argparse
def coordinates_list_to_BED(scaffold_name: str, coordinates: list) -> str:
"""
function to create BED format from a list of coordinates
takes list [[start, stop], [start, stop]]
"""
result = ""
for lst in coordinates:
result += (scaffold_name + '\t' + str(lst[0]) + '\t' + str(lst[1]) + '\n')
return result
def main():
coordinates = Coordinator(args.input, float(args.whole_genome_value), args.region_gap_size, args.deviation_percent)
coordinates_and_medians = coordinates.get_coordinates(args.window_size,
args.coverage_column_name,
args.window_column_name,
args.repeat_window_number)
coordinates_list = coordinates_and_medians[0]
# print(f"Chains: {coordinates_list}")
medians_list = coordinates_and_medians[1]
# print(f"Medians between chains: {medians_list}")
print('---- Medians between regions ---- \n', medians_list, sep="")
print("Concatenate if the median >=", coordinates.minimum_coverage)
print('---- Raw coordinates ---- \n', coordinates_list_to_BED(args.scaffold_name, coordinates_list), sep="")
coordinates_merge_by_median = Filter.concat_by_median(coordinates_list, # coordinates list
medians_list, # median list between regions
coordinates.minimum_coverage,
coordinates.maximum_coverage)
print('---- Filtration by median ---- \n', coordinates_list_to_BED(args.scaffold_name, coordinates_merge_by_median), sep="")
coordinates_merge_by_distance = Filter.concat_by_distance(coordinates_merge_by_median, args.min_region_length)
print('---- Filtration by distance ---- \n', coordinates_list_to_BED(args.scaffold_name, coordinates_merge_by_distance), sep="")
if args.output:
outfile = open(args.output + "_pseudoreg.bed", "w")
outfile.writelines(coordinates_list_to_BED(args.scaffold_name, coordinates_merge_by_distance))
if __name__ == "__main__":
parser = argparse.ArgumentParser(
description="Script for determining the coordinates of the PAR. Output of coordinates to BED file")
group_required = parser.add_argument_group('Required options')
group_required.add_argument('-i', '--input', type=lambda s: metaopen(s, "rt"),
help="input coverage_statistics_output.csv")
group_additional = parser.add_argument_group('Additional options')
group_additional.add_argument('-o', '--output', metavar='PATH', type=str, default=False,
help='output file prefix')
group_additional.add_argument('-f', '--window-size', type=int,
help="the window size used in your data")
group_additional.add_argument('--window_column_name', type=int,
help="number of column in coverage file with window number", default=1)
group_additional.add_argument('--coverage_column_name', type=int,
help="number of column in coverage file with mean/median coverage per window", default=2)
group_additional.add_argument('-s', '--scaffold-name', type=str,
help="name of column in coverage file with scaffold name", default="ChrX")
group_additional.add_argument('-m', '--whole_genome_value', type=str,
help="whole genome median/mean value")
group_additional.add_argument('-r', '--repeat_window_number', type=int,
help="number of repeating windows for a given condition", default=10)
group_additional.add_argument('-g', '--region_gap_size', type=int,
help="minimum allowable gap between regions for their merging (windows). If 0, then the median is calculated taking into account the next section, and not just the section between regions.",
default=0)
group_additional.add_argument('-l', '--min_region_length', type=int,
help="minimum distance between regions for their merging (nucleotides)",
default=0)
group_additional.add_argument('-d', '--deviation_percent', type=int,
help="measurement error", default=30)
args = parser.parse_args()
main()