annotate select_snps.py @ 0:939c20789501 default tip

Uploaded
author rico
date Fri, 06 Apr 2012 10:51:28 -0400
parents
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
939c20789501 Uploaded
rico
parents:
diff changeset
1 #!/usr/bin/env python
939c20789501 Uploaded
rico
parents:
diff changeset
2
939c20789501 Uploaded
rico
parents:
diff changeset
3 import os
939c20789501 Uploaded
rico
parents:
diff changeset
4 import sys
939c20789501 Uploaded
rico
parents:
diff changeset
5 import math
939c20789501 Uploaded
rico
parents:
diff changeset
6 from optparse import OptionParser
939c20789501 Uploaded
rico
parents:
diff changeset
7 import genome_diversity as gd
939c20789501 Uploaded
rico
parents:
diff changeset
8
939c20789501 Uploaded
rico
parents:
diff changeset
9 def main_function(parse_arguments=None):
939c20789501 Uploaded
rico
parents:
diff changeset
10 if parse_arguments is None:
939c20789501 Uploaded
rico
parents:
diff changeset
11 parse_arguments = lambda arguments: (None, arguments)
939c20789501 Uploaded
rico
parents:
diff changeset
12 def main_decorator(to_decorate):
939c20789501 Uploaded
rico
parents:
diff changeset
13 def decorated_main(arguments=None):
939c20789501 Uploaded
rico
parents:
diff changeset
14 if arguments is None:
939c20789501 Uploaded
rico
parents:
diff changeset
15 arguments = sys.argv
939c20789501 Uploaded
rico
parents:
diff changeset
16 options, arguments = parse_arguments(arguments)
939c20789501 Uploaded
rico
parents:
diff changeset
17 sys.exit(to_decorate(options, arguments))
939c20789501 Uploaded
rico
parents:
diff changeset
18 return decorated_main
939c20789501 Uploaded
rico
parents:
diff changeset
19 return main_decorator
939c20789501 Uploaded
rico
parents:
diff changeset
20
939c20789501 Uploaded
rico
parents:
diff changeset
21 def parse_arguments(arguments):
939c20789501 Uploaded
rico
parents:
diff changeset
22 parser = OptionParser()
939c20789501 Uploaded
rico
parents:
diff changeset
23 parser.add_option('--input', dest='input')
939c20789501 Uploaded
rico
parents:
diff changeset
24 parser.add_option('--output', dest='output')
939c20789501 Uploaded
rico
parents:
diff changeset
25 parser.add_option('--index_dir', dest='index_dir')
939c20789501 Uploaded
rico
parents:
diff changeset
26 parser.add_option('--num_snps', dest='num_snps')
939c20789501 Uploaded
rico
parents:
diff changeset
27 parser.add_option('--ref_chrom_col', dest='ref_chrom_col')
939c20789501 Uploaded
rico
parents:
diff changeset
28 parser.add_option('--ref_pos_col', dest='ref_pos_col')
939c20789501 Uploaded
rico
parents:
diff changeset
29 parser.add_option('--ref_species', dest='ref_species')
939c20789501 Uploaded
rico
parents:
diff changeset
30 return parser.parse_args(arguments[1:])
939c20789501 Uploaded
rico
parents:
diff changeset
31
939c20789501 Uploaded
rico
parents:
diff changeset
32 @main_function(parse_arguments)
939c20789501 Uploaded
rico
parents:
diff changeset
33 def main(options, arguments):
939c20789501 Uploaded
rico
parents:
diff changeset
34
939c20789501 Uploaded
rico
parents:
diff changeset
35 ref_chrom_idx = to_int( options.ref_chrom_col ) -1
939c20789501 Uploaded
rico
parents:
diff changeset
36 ref_pos_idx = to_int( options.ref_pos_col ) -1
939c20789501 Uploaded
rico
parents:
diff changeset
37
939c20789501 Uploaded
rico
parents:
diff changeset
38 if (ref_chrom_idx < 1) or (ref_pos_idx < 1) or (ref_chrom_idx == ref_pos_idx):
939c20789501 Uploaded
rico
parents:
diff changeset
39 print >> sys.stderr, "Cannot locate reference genome sequence (ref) or reference genome position (rPos) column for this dataset."
939c20789501 Uploaded
rico
parents:
diff changeset
40 sys.exit(1)
939c20789501 Uploaded
rico
parents:
diff changeset
41
939c20789501 Uploaded
rico
parents:
diff changeset
42 chrom_len_root = os.path.join( options.index_dir, 'shared/ucsc/chrom')
939c20789501 Uploaded
rico
parents:
diff changeset
43 chrom_len_file = '%s.len' % options.ref_species
939c20789501 Uploaded
rico
parents:
diff changeset
44 chrom_len_path = os.path.join(chrom_len_root, chrom_len_file)
939c20789501 Uploaded
rico
parents:
diff changeset
45
939c20789501 Uploaded
rico
parents:
diff changeset
46 chrlens = gd.ChrLens( chrom_len_path )
939c20789501 Uploaded
rico
parents:
diff changeset
47
939c20789501 Uploaded
rico
parents:
diff changeset
48 total_len = 0
939c20789501 Uploaded
rico
parents:
diff changeset
49 for chrom in chrlens:
939c20789501 Uploaded
rico
parents:
diff changeset
50 total_len += chrlens.length(chrom)
939c20789501 Uploaded
rico
parents:
diff changeset
51
939c20789501 Uploaded
rico
parents:
diff changeset
52 total_requested = int( options.num_snps )
939c20789501 Uploaded
rico
parents:
diff changeset
53 lines, data, comments = get_snp_lines_data_and_comments( options.input, ref_chrom_idx, ref_pos_idx )
939c20789501 Uploaded
rico
parents:
diff changeset
54 selected = select_snps( data, total_len, total_requested )
939c20789501 Uploaded
rico
parents:
diff changeset
55 out_data = fix_selection_and_order_like_input(data, selected, total_requested)
939c20789501 Uploaded
rico
parents:
diff changeset
56 write_selected_snps( options.output, out_data, lines, comments )
939c20789501 Uploaded
rico
parents:
diff changeset
57
939c20789501 Uploaded
rico
parents:
diff changeset
58 def to_int( value ):
939c20789501 Uploaded
rico
parents:
diff changeset
59 try:
939c20789501 Uploaded
rico
parents:
diff changeset
60 int_value = int( value )
939c20789501 Uploaded
rico
parents:
diff changeset
61 except ValueError:
939c20789501 Uploaded
rico
parents:
diff changeset
62 int_value = 0
939c20789501 Uploaded
rico
parents:
diff changeset
63 return int_value
939c20789501 Uploaded
rico
parents:
diff changeset
64
939c20789501 Uploaded
rico
parents:
diff changeset
65 def get_snp_lines_data_and_comments( filename, chrom_idx, pos_idx ):
939c20789501 Uploaded
rico
parents:
diff changeset
66 fh = open( filename, 'r' )
939c20789501 Uploaded
rico
parents:
diff changeset
67 if (chrom_idx >= pos_idx):
939c20789501 Uploaded
rico
parents:
diff changeset
68 needed = chrom_idx + 1
939c20789501 Uploaded
rico
parents:
diff changeset
69 else:
939c20789501 Uploaded
rico
parents:
diff changeset
70 needed = pos_idx + 1
939c20789501 Uploaded
rico
parents:
diff changeset
71 lines = []
939c20789501 Uploaded
rico
parents:
diff changeset
72 data = []
939c20789501 Uploaded
rico
parents:
diff changeset
73 comments = []
939c20789501 Uploaded
rico
parents:
diff changeset
74 line_idx = 0
939c20789501 Uploaded
rico
parents:
diff changeset
75 line_num = 0
939c20789501 Uploaded
rico
parents:
diff changeset
76 for line in fh:
939c20789501 Uploaded
rico
parents:
diff changeset
77 line_num += 1
939c20789501 Uploaded
rico
parents:
diff changeset
78 line = line.rstrip('\r\n')
939c20789501 Uploaded
rico
parents:
diff changeset
79 if line:
939c20789501 Uploaded
rico
parents:
diff changeset
80 if line.startswith('#'):
939c20789501 Uploaded
rico
parents:
diff changeset
81 comments.append(line)
939c20789501 Uploaded
rico
parents:
diff changeset
82 else:
939c20789501 Uploaded
rico
parents:
diff changeset
83 elems = line.split('\t')
939c20789501 Uploaded
rico
parents:
diff changeset
84 if len(elems) >= needed:
939c20789501 Uploaded
rico
parents:
diff changeset
85 chrom = elems[chrom_idx]
939c20789501 Uploaded
rico
parents:
diff changeset
86 try:
939c20789501 Uploaded
rico
parents:
diff changeset
87 pos = int(elems[pos_idx])
939c20789501 Uploaded
rico
parents:
diff changeset
88 except ValueError:
939c20789501 Uploaded
rico
parents:
diff changeset
89 sys.stderr.write( "bad reference position in line %d column %d: %s\n" % ( line_num, pos_idx+1, elems[pos_idx] ) )
939c20789501 Uploaded
rico
parents:
diff changeset
90 sys.exit(1)
939c20789501 Uploaded
rico
parents:
diff changeset
91 lines.append(line)
939c20789501 Uploaded
rico
parents:
diff changeset
92 chrom_sort = chrom.lstrip('chr')
939c20789501 Uploaded
rico
parents:
diff changeset
93 data.append( [chrom_sort, chrom, pos, line_num, line_idx] )
939c20789501 Uploaded
rico
parents:
diff changeset
94 line_idx += 1
939c20789501 Uploaded
rico
parents:
diff changeset
95 fh.close()
939c20789501 Uploaded
rico
parents:
diff changeset
96 data = sorted( data, key=lambda x: (x[0], x[2]) )
939c20789501 Uploaded
rico
parents:
diff changeset
97 return lines, data, comments
939c20789501 Uploaded
rico
parents:
diff changeset
98
939c20789501 Uploaded
rico
parents:
diff changeset
99 def select_snps( data, total_len, requested ):
939c20789501 Uploaded
rico
parents:
diff changeset
100 old_chrom = None
939c20789501 Uploaded
rico
parents:
diff changeset
101 next_print = 0
939c20789501 Uploaded
rico
parents:
diff changeset
102 selected = []
939c20789501 Uploaded
rico
parents:
diff changeset
103 space = total_len / requested
939c20789501 Uploaded
rico
parents:
diff changeset
104 for data_idx, datum in enumerate( data ):
939c20789501 Uploaded
rico
parents:
diff changeset
105 chrom = datum[1]
939c20789501 Uploaded
rico
parents:
diff changeset
106 pos = datum[2]
939c20789501 Uploaded
rico
parents:
diff changeset
107 if chrom != old_chrom:
939c20789501 Uploaded
rico
parents:
diff changeset
108 old_chrom = chrom
939c20789501 Uploaded
rico
parents:
diff changeset
109 next_print = 0
939c20789501 Uploaded
rico
parents:
diff changeset
110 if pos >= next_print:
939c20789501 Uploaded
rico
parents:
diff changeset
111 selected.append(data_idx)
939c20789501 Uploaded
rico
parents:
diff changeset
112 next_print += space
939c20789501 Uploaded
rico
parents:
diff changeset
113 return selected
939c20789501 Uploaded
rico
parents:
diff changeset
114
939c20789501 Uploaded
rico
parents:
diff changeset
115 def fix_selection_and_order_like_input(data, selected, requested):
939c20789501 Uploaded
rico
parents:
diff changeset
116 total_selected = len( selected )
939c20789501 Uploaded
rico
parents:
diff changeset
117 a = float( total_selected ) / requested
939c20789501 Uploaded
rico
parents:
diff changeset
118 b = a / 2
939c20789501 Uploaded
rico
parents:
diff changeset
119
939c20789501 Uploaded
rico
parents:
diff changeset
120 idx_list = []
939c20789501 Uploaded
rico
parents:
diff changeset
121 for i in range( requested ):
939c20789501 Uploaded
rico
parents:
diff changeset
122 idx = int( math.ceil( i * a + b ) - 1 )
939c20789501 Uploaded
rico
parents:
diff changeset
123 idx_list.append( idx )
939c20789501 Uploaded
rico
parents:
diff changeset
124
939c20789501 Uploaded
rico
parents:
diff changeset
125 out_data = []
939c20789501 Uploaded
rico
parents:
diff changeset
126
939c20789501 Uploaded
rico
parents:
diff changeset
127 for i, data_idx in enumerate(selected):
939c20789501 Uploaded
rico
parents:
diff changeset
128 if total_selected > requested:
939c20789501 Uploaded
rico
parents:
diff changeset
129 if i in idx_list:
939c20789501 Uploaded
rico
parents:
diff changeset
130 out_data.append(data[data_idx])
939c20789501 Uploaded
rico
parents:
diff changeset
131 else:
939c20789501 Uploaded
rico
parents:
diff changeset
132 out_data.append(data[data_idx])
939c20789501 Uploaded
rico
parents:
diff changeset
133
939c20789501 Uploaded
rico
parents:
diff changeset
134 out_data = sorted( out_data, key=lambda x: x[3] )
939c20789501 Uploaded
rico
parents:
diff changeset
135
939c20789501 Uploaded
rico
parents:
diff changeset
136 return out_data
939c20789501 Uploaded
rico
parents:
diff changeset
137
939c20789501 Uploaded
rico
parents:
diff changeset
138 def write_selected_snps( filename, data, lines, comments ):
939c20789501 Uploaded
rico
parents:
diff changeset
139 fh = open( filename, 'w' )
939c20789501 Uploaded
rico
parents:
diff changeset
140
939c20789501 Uploaded
rico
parents:
diff changeset
141 for comment in comments:
939c20789501 Uploaded
rico
parents:
diff changeset
142 fh.write("%s\n" % comment )
939c20789501 Uploaded
rico
parents:
diff changeset
143
939c20789501 Uploaded
rico
parents:
diff changeset
144 for datum in data:
939c20789501 Uploaded
rico
parents:
diff changeset
145 line_idx = datum[4]
939c20789501 Uploaded
rico
parents:
diff changeset
146 fh.write("%s\n" % lines[line_idx])
939c20789501 Uploaded
rico
parents:
diff changeset
147
939c20789501 Uploaded
rico
parents:
diff changeset
148 fh.close()
939c20789501 Uploaded
rico
parents:
diff changeset
149
939c20789501 Uploaded
rico
parents:
diff changeset
150 if __name__ == "__main__":
939c20789501 Uploaded
rico
parents:
diff changeset
151 main()
939c20789501 Uploaded
rico
parents:
diff changeset
152
939c20789501 Uploaded
rico
parents:
diff changeset
153