-
Notifications
You must be signed in to change notification settings - Fork 22
/
Copy pathcolumn_select.py
executable file
·93 lines (77 loc) · 3.39 KB
/
column_select.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
#!/usr/bin/env python
import argparse, sys
from argparse import RawTextHelpFormatter
# Module metadata. __author__ and __version__ are embedded in the --help
# description text assembled by get_args().
__author__ = "Colby Chiang (cchiang@genome.wustl.edu)"
__version__ = "$Revision: 0.0.1 $"
__date__ = "$Date: 2015-08-11 10:59 $"
# --------------------------------------
# define functions
def get_args():
    """Parse the command line and return the populated argparse namespace.

    Options:
        -c/--col       file listing the column headers to extract (required)
        -l/--leading   number of leading columns to always print (default 0)
        -s/--skiprows  number of leading rows echoed before the header (default 0)
        -p/--pass      prefix of comment lines to echo unchanged (default None)
        -m/--missing   fill string for requested columns absent from the header
        input          optional input file; falls back to stdin when piped

    Returns:
        argparse.Namespace with attributes ``col``, ``leading``, ``skiprows``,
        ``pass_prefix``, ``missing_fill`` and ``input`` (an open file object).

    Exits with status 1 after printing help when no input file is given and
    stdin is an interactive terminal.
    """
    description = (
        "column_select.py\n"
        "author: " + __author__ + "\n"
        "version: " + __version__ + "\n"
        "description: select columns from a file by header names"
    )
    parser = argparse.ArgumentParser(
        formatter_class=RawTextHelpFormatter,
        description=description,
    )
    parser.add_argument('-c', '--col', metavar='FILE', required=True,
                        type=argparse.FileType('r'),
                        help='list of column headers to extract')
    parser.add_argument('-l', '--leading', metavar='INT', required=False,
                        type=int, default=0,
                        help='number of leading columns to print [0]')
    parser.add_argument('-s', '--skiprows', metavar='INT', required=False,
                        type=int, default=0,
                        help='number of leading rows to print [0]')
    parser.add_argument('-p', '--pass', metavar='STR', dest='pass_prefix',
                        required=False, default=None,
                        help='prefix for comment lines in INPUT to pass unfiltered')
    parser.add_argument('-m', '--missing', metavar='STR', dest='missing_fill',
                        type=str, required=False, default=None,
                        help="fill missing columns with string (e.g.: NA)")
    parser.add_argument('input', nargs='?', type=argparse.FileType('r'),
                        default=None, help='phenotype file')

    # parse the arguments
    args = parser.parse_args()

    # if no input, check if part of pipe and if so, read stdin.
    if args.input is None:
        if sys.stdin.isatty():
            # Nothing was piped in and no file was named: show usage and bail.
            parser.print_help()
            # sys.exit rather than the bare exit() builtin, which is only
            # injected by the site module and may be absent (e.g. python -S).
            sys.exit(1)
        args.input = sys.stdin

    # send back the user input
    return args
# primary function
def extract_cols(col, lead_cols, skip_rows, pass_prefix, missing_fill, source):
    """Print selected columns of a tab-delimited stream to stdout.

    Args:
        col: open file-like object listing one column header name per line.
            Trailing whitespace is stripped from each name and blank lines
            are ignored.
        lead_cols: number of leading columns that are always printed ahead
            of the selected columns.
        skip_rows: number of initial lines of ``source`` that are echoed
            unchanged before the header line is expected.
        pass_prefix: lines of ``source`` starting with this string are echoed
            unchanged; ``None`` disables the pass-through.
        missing_fill: string printed for requested columns that are absent
            from the header. When ``None``, absent columns are dropped from
            the output instead.
        source: open file-like object with tab-delimited rows; the first line
            that is neither skipped nor passed through is the header.

    Returns:
        None. Output goes to stdout. Both ``col`` and ``source`` are closed
        before returning, including when an error is raised.

    Raises:
        IndexError: if a row has fewer fields than a selected column index.
    """
    try:
        # Requested header names, in the order they should be printed.
        select = [name for name in (raw.rstrip() for raw in col) if name]

        get_columns = None  # resolved once the header line has been seen
        skip_count = 0
        for raw in source:
            # Strip only the line terminator: a bare rstrip() would also eat
            # trailing tabs and silently drop empty trailing fields.
            line = raw.rstrip('\r\n')

            if skip_count < skip_rows:
                print(line)
                skip_count += 1
                continue

            if pass_prefix is not None and line.startswith(pass_prefix):
                print(line)
                continue

            v = line.split('\t')

            if get_columns is None:
                # Header row: map each name to the index of its FIRST
                # occurrence (same result as list.index, in a single pass).
                column_map = {}
                for idx, name in enumerate(v):
                    column_map.setdefault(name, idx)

                # list(...) keeps the concatenation valid on Python 3, where
                # range() is no longer a list.
                lead = list(range(lead_cols))
                if missing_fill is None:
                    # Absent columns are dropped entirely.
                    picked = [column_map[x] for x in select if x in column_map]
                    header_v = [v[x] for x in lead + picked]
                else:
                    # Absent columns are kept; None marks "use the fill value".
                    picked = [column_map.get(x) for x in select]
                    header_v = [v[x] for x in lead] + select
                get_columns = lead + picked
                print('\t'.join(header_v))
            else:
                print('\t'.join(
                    missing_fill if x is None else v[x] for x in get_columns
                ))
    finally:
        source.close()
        col.close()
    return
# --------------------------------------
# main function
def main():
    """Script entry point: read the command line, then run the extraction."""
    # gather the user's options
    opts = get_args()

    # hand every option to the worker, naming each one explicitly
    extract_cols(
        col=opts.col,
        lead_cols=opts.leading,
        skip_rows=opts.skiprows,
        pass_prefix=opts.pass_prefix,
        missing_fill=opts.missing_fill,
        source=opts.input,
    )
# initialize the script
if __name__ == '__main__':
    try:
        sys.exit(main())
    except IOError as e:
        # errno 32 is EPIPE: the reader on the other end of the pipe went
        # away (e.g. `column_select.py ... | head`). Exit quietly in that
        # case and re-raise every other I/O error.
        if e.errno != 32:
            raise