annotate select_snps.py @ 0:32f8fbdd754c default tip

Uploaded
author rico
date Fri, 06 Apr 2012 13:59:29 -0400
parents
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
32f8fbdd754c Uploaded
rico
parents:
diff changeset
1 #!/usr/bin/env python
32f8fbdd754c Uploaded
rico
parents:
diff changeset
2
32f8fbdd754c Uploaded
rico
parents:
diff changeset
3 import os
32f8fbdd754c Uploaded
rico
parents:
diff changeset
4 import sys
32f8fbdd754c Uploaded
rico
parents:
diff changeset
5 import math
32f8fbdd754c Uploaded
rico
parents:
diff changeset
6 from optparse import OptionParser
32f8fbdd754c Uploaded
rico
parents:
diff changeset
7 import genome_diversity as gd
32f8fbdd754c Uploaded
rico
parents:
diff changeset
8
32f8fbdd754c Uploaded
rico
parents:
diff changeset
9 def main_function(parse_arguments=None):
32f8fbdd754c Uploaded
rico
parents:
diff changeset
10 if parse_arguments is None:
32f8fbdd754c Uploaded
rico
parents:
diff changeset
11 parse_arguments = lambda arguments: (None, arguments)
32f8fbdd754c Uploaded
rico
parents:
diff changeset
12 def main_decorator(to_decorate):
32f8fbdd754c Uploaded
rico
parents:
diff changeset
13 def decorated_main(arguments=None):
32f8fbdd754c Uploaded
rico
parents:
diff changeset
14 if arguments is None:
32f8fbdd754c Uploaded
rico
parents:
diff changeset
15 arguments = sys.argv
32f8fbdd754c Uploaded
rico
parents:
diff changeset
16 options, arguments = parse_arguments(arguments)
32f8fbdd754c Uploaded
rico
parents:
diff changeset
17 sys.exit(to_decorate(options, arguments))
32f8fbdd754c Uploaded
rico
parents:
diff changeset
18 return decorated_main
32f8fbdd754c Uploaded
rico
parents:
diff changeset
19 return main_decorator
32f8fbdd754c Uploaded
rico
parents:
diff changeset
20
32f8fbdd754c Uploaded
rico
parents:
diff changeset
21 def parse_arguments(arguments):
32f8fbdd754c Uploaded
rico
parents:
diff changeset
22 parser = OptionParser()
32f8fbdd754c Uploaded
rico
parents:
diff changeset
23 parser.add_option('--input', dest='input')
32f8fbdd754c Uploaded
rico
parents:
diff changeset
24 parser.add_option('--output', dest='output')
32f8fbdd754c Uploaded
rico
parents:
diff changeset
25 parser.add_option('--index_dir', dest='index_dir')
32f8fbdd754c Uploaded
rico
parents:
diff changeset
26 parser.add_option('--num_snps', dest='num_snps')
32f8fbdd754c Uploaded
rico
parents:
diff changeset
27 parser.add_option('--ref_chrom_col', dest='ref_chrom_col')
32f8fbdd754c Uploaded
rico
parents:
diff changeset
28 parser.add_option('--ref_pos_col', dest='ref_pos_col')
32f8fbdd754c Uploaded
rico
parents:
diff changeset
29 parser.add_option('--ref_species', dest='ref_species')
32f8fbdd754c Uploaded
rico
parents:
diff changeset
30 return parser.parse_args(arguments[1:])
32f8fbdd754c Uploaded
rico
parents:
diff changeset
31
32f8fbdd754c Uploaded
rico
parents:
diff changeset
32 @main_function(parse_arguments)
32f8fbdd754c Uploaded
rico
parents:
diff changeset
33 def main(options, arguments):
32f8fbdd754c Uploaded
rico
parents:
diff changeset
34
32f8fbdd754c Uploaded
rico
parents:
diff changeset
35 ref_chrom_idx = to_int( options.ref_chrom_col ) -1
32f8fbdd754c Uploaded
rico
parents:
diff changeset
36 ref_pos_idx = to_int( options.ref_pos_col ) -1
32f8fbdd754c Uploaded
rico
parents:
diff changeset
37
32f8fbdd754c Uploaded
rico
parents:
diff changeset
38 if (ref_chrom_idx < 1) or (ref_pos_idx < 1) or (ref_chrom_idx == ref_pos_idx):
32f8fbdd754c Uploaded
rico
parents:
diff changeset
39 print >> sys.stderr, "Cannot locate reference genome sequence (ref) or reference genome position (rPos) column for this dataset."
32f8fbdd754c Uploaded
rico
parents:
diff changeset
40 sys.exit(1)
32f8fbdd754c Uploaded
rico
parents:
diff changeset
41
32f8fbdd754c Uploaded
rico
parents:
diff changeset
42 chrom_len_root = os.path.join( options.index_dir, 'shared/ucsc/chrom')
32f8fbdd754c Uploaded
rico
parents:
diff changeset
43 chrom_len_file = '%s.len' % options.ref_species
32f8fbdd754c Uploaded
rico
parents:
diff changeset
44 chrom_len_path = os.path.join(chrom_len_root, chrom_len_file)
32f8fbdd754c Uploaded
rico
parents:
diff changeset
45
32f8fbdd754c Uploaded
rico
parents:
diff changeset
46 chrlens = gd.ChrLens( chrom_len_path )
32f8fbdd754c Uploaded
rico
parents:
diff changeset
47
32f8fbdd754c Uploaded
rico
parents:
diff changeset
48 total_len = 0
32f8fbdd754c Uploaded
rico
parents:
diff changeset
49 for chrom in chrlens:
32f8fbdd754c Uploaded
rico
parents:
diff changeset
50 total_len += chrlens.length(chrom)
32f8fbdd754c Uploaded
rico
parents:
diff changeset
51
32f8fbdd754c Uploaded
rico
parents:
diff changeset
52 total_requested = int( options.num_snps )
32f8fbdd754c Uploaded
rico
parents:
diff changeset
53 lines, data, comments = get_snp_lines_data_and_comments( options.input, ref_chrom_idx, ref_pos_idx )
32f8fbdd754c Uploaded
rico
parents:
diff changeset
54 selected = select_snps( data, total_len, total_requested )
32f8fbdd754c Uploaded
rico
parents:
diff changeset
55 out_data = fix_selection_and_order_like_input(data, selected, total_requested)
32f8fbdd754c Uploaded
rico
parents:
diff changeset
56 write_selected_snps( options.output, out_data, lines, comments )
32f8fbdd754c Uploaded
rico
parents:
diff changeset
57
32f8fbdd754c Uploaded
rico
parents:
diff changeset
58 def to_int( value ):
32f8fbdd754c Uploaded
rico
parents:
diff changeset
59 try:
32f8fbdd754c Uploaded
rico
parents:
diff changeset
60 int_value = int( value )
32f8fbdd754c Uploaded
rico
parents:
diff changeset
61 except ValueError:
32f8fbdd754c Uploaded
rico
parents:
diff changeset
62 int_value = 0
32f8fbdd754c Uploaded
rico
parents:
diff changeset
63 return int_value
32f8fbdd754c Uploaded
rico
parents:
diff changeset
64
32f8fbdd754c Uploaded
rico
parents:
diff changeset
65 def get_snp_lines_data_and_comments( filename, chrom_idx, pos_idx ):
32f8fbdd754c Uploaded
rico
parents:
diff changeset
66 fh = open( filename, 'r' )
32f8fbdd754c Uploaded
rico
parents:
diff changeset
67 if (chrom_idx >= pos_idx):
32f8fbdd754c Uploaded
rico
parents:
diff changeset
68 needed = chrom_idx + 1
32f8fbdd754c Uploaded
rico
parents:
diff changeset
69 else:
32f8fbdd754c Uploaded
rico
parents:
diff changeset
70 needed = pos_idx + 1
32f8fbdd754c Uploaded
rico
parents:
diff changeset
71 lines = []
32f8fbdd754c Uploaded
rico
parents:
diff changeset
72 data = []
32f8fbdd754c Uploaded
rico
parents:
diff changeset
73 comments = []
32f8fbdd754c Uploaded
rico
parents:
diff changeset
74 line_idx = 0
32f8fbdd754c Uploaded
rico
parents:
diff changeset
75 line_num = 0
32f8fbdd754c Uploaded
rico
parents:
diff changeset
76 for line in fh:
32f8fbdd754c Uploaded
rico
parents:
diff changeset
77 line_num += 1
32f8fbdd754c Uploaded
rico
parents:
diff changeset
78 line = line.rstrip('\r\n')
32f8fbdd754c Uploaded
rico
parents:
diff changeset
79 if line:
32f8fbdd754c Uploaded
rico
parents:
diff changeset
80 if line.startswith('#'):
32f8fbdd754c Uploaded
rico
parents:
diff changeset
81 comments.append(line)
32f8fbdd754c Uploaded
rico
parents:
diff changeset
82 else:
32f8fbdd754c Uploaded
rico
parents:
diff changeset
83 elems = line.split('\t')
32f8fbdd754c Uploaded
rico
parents:
diff changeset
84 if len(elems) >= needed:
32f8fbdd754c Uploaded
rico
parents:
diff changeset
85 chrom = elems[chrom_idx]
32f8fbdd754c Uploaded
rico
parents:
diff changeset
86 try:
32f8fbdd754c Uploaded
rico
parents:
diff changeset
87 pos = int(elems[pos_idx])
32f8fbdd754c Uploaded
rico
parents:
diff changeset
88 except ValueError:
32f8fbdd754c Uploaded
rico
parents:
diff changeset
89 sys.stderr.write( "bad reference position in line %d column %d: %s\n" % ( line_num, pos_idx+1, elems[pos_idx] ) )
32f8fbdd754c Uploaded
rico
parents:
diff changeset
90 sys.exit(1)
32f8fbdd754c Uploaded
rico
parents:
diff changeset
91 lines.append(line)
32f8fbdd754c Uploaded
rico
parents:
diff changeset
92 chrom_sort = chrom.lstrip('chr')
32f8fbdd754c Uploaded
rico
parents:
diff changeset
93 data.append( [chrom_sort, chrom, pos, line_num, line_idx] )
32f8fbdd754c Uploaded
rico
parents:
diff changeset
94 line_idx += 1
32f8fbdd754c Uploaded
rico
parents:
diff changeset
95 fh.close()
32f8fbdd754c Uploaded
rico
parents:
diff changeset
96 data = sorted( data, key=lambda x: (x[0], x[2]) )
32f8fbdd754c Uploaded
rico
parents:
diff changeset
97 return lines, data, comments
32f8fbdd754c Uploaded
rico
parents:
diff changeset
98
32f8fbdd754c Uploaded
rico
parents:
diff changeset
99 def select_snps( data, total_len, requested ):
32f8fbdd754c Uploaded
rico
parents:
diff changeset
100 old_chrom = None
32f8fbdd754c Uploaded
rico
parents:
diff changeset
101 next_print = 0
32f8fbdd754c Uploaded
rico
parents:
diff changeset
102 selected = []
32f8fbdd754c Uploaded
rico
parents:
diff changeset
103 space = total_len / requested
32f8fbdd754c Uploaded
rico
parents:
diff changeset
104 for data_idx, datum in enumerate( data ):
32f8fbdd754c Uploaded
rico
parents:
diff changeset
105 chrom = datum[1]
32f8fbdd754c Uploaded
rico
parents:
diff changeset
106 pos = datum[2]
32f8fbdd754c Uploaded
rico
parents:
diff changeset
107 if chrom != old_chrom:
32f8fbdd754c Uploaded
rico
parents:
diff changeset
108 old_chrom = chrom
32f8fbdd754c Uploaded
rico
parents:
diff changeset
109 next_print = 0
32f8fbdd754c Uploaded
rico
parents:
diff changeset
110 if pos >= next_print:
32f8fbdd754c Uploaded
rico
parents:
diff changeset
111 selected.append(data_idx)
32f8fbdd754c Uploaded
rico
parents:
diff changeset
112 next_print += space
32f8fbdd754c Uploaded
rico
parents:
diff changeset
113 return selected
32f8fbdd754c Uploaded
rico
parents:
diff changeset
114
32f8fbdd754c Uploaded
rico
parents:
diff changeset
115 def fix_selection_and_order_like_input(data, selected, requested):
32f8fbdd754c Uploaded
rico
parents:
diff changeset
116 total_selected = len( selected )
32f8fbdd754c Uploaded
rico
parents:
diff changeset
117 a = float( total_selected ) / requested
32f8fbdd754c Uploaded
rico
parents:
diff changeset
118 b = a / 2
32f8fbdd754c Uploaded
rico
parents:
diff changeset
119
32f8fbdd754c Uploaded
rico
parents:
diff changeset
120 idx_list = []
32f8fbdd754c Uploaded
rico
parents:
diff changeset
121 for i in range( requested ):
32f8fbdd754c Uploaded
rico
parents:
diff changeset
122 idx = int( math.ceil( i * a + b ) - 1 )
32f8fbdd754c Uploaded
rico
parents:
diff changeset
123 idx_list.append( idx )
32f8fbdd754c Uploaded
rico
parents:
diff changeset
124
32f8fbdd754c Uploaded
rico
parents:
diff changeset
125 out_data = []
32f8fbdd754c Uploaded
rico
parents:
diff changeset
126
32f8fbdd754c Uploaded
rico
parents:
diff changeset
127 for i, data_idx in enumerate(selected):
32f8fbdd754c Uploaded
rico
parents:
diff changeset
128 if total_selected > requested:
32f8fbdd754c Uploaded
rico
parents:
diff changeset
129 if i in idx_list:
32f8fbdd754c Uploaded
rico
parents:
diff changeset
130 out_data.append(data[data_idx])
32f8fbdd754c Uploaded
rico
parents:
diff changeset
131 else:
32f8fbdd754c Uploaded
rico
parents:
diff changeset
132 out_data.append(data[data_idx])
32f8fbdd754c Uploaded
rico
parents:
diff changeset
133
32f8fbdd754c Uploaded
rico
parents:
diff changeset
134 out_data = sorted( out_data, key=lambda x: x[3] )
32f8fbdd754c Uploaded
rico
parents:
diff changeset
135
32f8fbdd754c Uploaded
rico
parents:
diff changeset
136 return out_data
32f8fbdd754c Uploaded
rico
parents:
diff changeset
137
32f8fbdd754c Uploaded
rico
parents:
diff changeset
138 def write_selected_snps( filename, data, lines, comments ):
32f8fbdd754c Uploaded
rico
parents:
diff changeset
139 fh = open( filename, 'w' )
32f8fbdd754c Uploaded
rico
parents:
diff changeset
140
32f8fbdd754c Uploaded
rico
parents:
diff changeset
141 for comment in comments:
32f8fbdd754c Uploaded
rico
parents:
diff changeset
142 fh.write("%s\n" % comment )
32f8fbdd754c Uploaded
rico
parents:
diff changeset
143
32f8fbdd754c Uploaded
rico
parents:
diff changeset
144 for datum in data:
32f8fbdd754c Uploaded
rico
parents:
diff changeset
145 line_idx = datum[4]
32f8fbdd754c Uploaded
rico
parents:
diff changeset
146 fh.write("%s\n" % lines[line_idx])
32f8fbdd754c Uploaded
rico
parents:
diff changeset
147
32f8fbdd754c Uploaded
rico
parents:
diff changeset
148 fh.close()
32f8fbdd754c Uploaded
rico
parents:
diff changeset
149
32f8fbdd754c Uploaded
rico
parents:
diff changeset
150 if __name__ == "__main__":
32f8fbdd754c Uploaded
rico
parents:
diff changeset
151 main()
32f8fbdd754c Uploaded
rico
parents:
diff changeset
152
32f8fbdd754c Uploaded
rico
parents:
diff changeset
153