annotate select_snps.py @ 0:580da578c5e6 default tip

Uploaded
author rico
date Thu, 05 Apr 2012 15:56:36 -0400
parents
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
580da578c5e6 Uploaded
rico
parents:
diff changeset
1 #!/usr/bin/env python
580da578c5e6 Uploaded
rico
parents:
diff changeset
2
580da578c5e6 Uploaded
rico
parents:
diff changeset
3 import os
580da578c5e6 Uploaded
rico
parents:
diff changeset
4 import sys
580da578c5e6 Uploaded
rico
parents:
diff changeset
5 import math
580da578c5e6 Uploaded
rico
parents:
diff changeset
6 from optparse import OptionParser
580da578c5e6 Uploaded
rico
parents:
diff changeset
7 import genome_diversity as gd
580da578c5e6 Uploaded
rico
parents:
diff changeset
8
580da578c5e6 Uploaded
rico
parents:
diff changeset
9 def main_function(parse_arguments=None):
580da578c5e6 Uploaded
rico
parents:
diff changeset
10 if parse_arguments is None:
580da578c5e6 Uploaded
rico
parents:
diff changeset
11 parse_arguments = lambda arguments: (None, arguments)
580da578c5e6 Uploaded
rico
parents:
diff changeset
12 def main_decorator(to_decorate):
580da578c5e6 Uploaded
rico
parents:
diff changeset
13 def decorated_main(arguments=None):
580da578c5e6 Uploaded
rico
parents:
diff changeset
14 if arguments is None:
580da578c5e6 Uploaded
rico
parents:
diff changeset
15 arguments = sys.argv
580da578c5e6 Uploaded
rico
parents:
diff changeset
16 options, arguments = parse_arguments(arguments)
580da578c5e6 Uploaded
rico
parents:
diff changeset
17 sys.exit(to_decorate(options, arguments))
580da578c5e6 Uploaded
rico
parents:
diff changeset
18 return decorated_main
580da578c5e6 Uploaded
rico
parents:
diff changeset
19 return main_decorator
580da578c5e6 Uploaded
rico
parents:
diff changeset
20
580da578c5e6 Uploaded
rico
parents:
diff changeset
21 def parse_arguments(arguments):
580da578c5e6 Uploaded
rico
parents:
diff changeset
22 parser = OptionParser()
580da578c5e6 Uploaded
rico
parents:
diff changeset
23 parser.add_option('--input', dest='input')
580da578c5e6 Uploaded
rico
parents:
diff changeset
24 parser.add_option('--output', dest='output')
580da578c5e6 Uploaded
rico
parents:
diff changeset
25 parser.add_option('--index_dir', dest='index_dir')
580da578c5e6 Uploaded
rico
parents:
diff changeset
26 parser.add_option('--num_snps', dest='num_snps')
580da578c5e6 Uploaded
rico
parents:
diff changeset
27 parser.add_option('--ref_chrom_col', dest='ref_chrom_col')
580da578c5e6 Uploaded
rico
parents:
diff changeset
28 parser.add_option('--ref_pos_col', dest='ref_pos_col')
580da578c5e6 Uploaded
rico
parents:
diff changeset
29 parser.add_option('--ref_species', dest='ref_species')
580da578c5e6 Uploaded
rico
parents:
diff changeset
30 return parser.parse_args(arguments[1:])
580da578c5e6 Uploaded
rico
parents:
diff changeset
31
580da578c5e6 Uploaded
rico
parents:
diff changeset
32 @main_function(parse_arguments)
580da578c5e6 Uploaded
rico
parents:
diff changeset
33 def main(options, arguments):
580da578c5e6 Uploaded
rico
parents:
diff changeset
34
580da578c5e6 Uploaded
rico
parents:
diff changeset
35 ref_chrom_idx = to_int( options.ref_chrom_col ) -1
580da578c5e6 Uploaded
rico
parents:
diff changeset
36 ref_pos_idx = to_int( options.ref_pos_col ) -1
580da578c5e6 Uploaded
rico
parents:
diff changeset
37
580da578c5e6 Uploaded
rico
parents:
diff changeset
38 if (ref_chrom_idx < 1) or (ref_pos_idx < 1) or (ref_chrom_idx == ref_pos_idx):
580da578c5e6 Uploaded
rico
parents:
diff changeset
39 print >> sys.stderr, "Cannot locate reference genome sequence (ref) or reference genome position (rPos) column for this dataset."
580da578c5e6 Uploaded
rico
parents:
diff changeset
40 sys.exit(1)
580da578c5e6 Uploaded
rico
parents:
diff changeset
41
580da578c5e6 Uploaded
rico
parents:
diff changeset
42 chrom_len_root = os.path.join( options.index_dir, 'shared/ucsc/chrom')
580da578c5e6 Uploaded
rico
parents:
diff changeset
43 chrom_len_file = '%s.len' % options.ref_species
580da578c5e6 Uploaded
rico
parents:
diff changeset
44 chrom_len_path = os.path.join(chrom_len_root, chrom_len_file)
580da578c5e6 Uploaded
rico
parents:
diff changeset
45
580da578c5e6 Uploaded
rico
parents:
diff changeset
46 chrlens = gd.ChrLens( chrom_len_path )
580da578c5e6 Uploaded
rico
parents:
diff changeset
47
580da578c5e6 Uploaded
rico
parents:
diff changeset
48 total_len = 0
580da578c5e6 Uploaded
rico
parents:
diff changeset
49 for chrom in chrlens:
580da578c5e6 Uploaded
rico
parents:
diff changeset
50 total_len += chrlens.length(chrom)
580da578c5e6 Uploaded
rico
parents:
diff changeset
51
580da578c5e6 Uploaded
rico
parents:
diff changeset
52 total_requested = int( options.num_snps )
580da578c5e6 Uploaded
rico
parents:
diff changeset
53 lines, data, comments = get_snp_lines_data_and_comments( options.input, ref_chrom_idx, ref_pos_idx )
580da578c5e6 Uploaded
rico
parents:
diff changeset
54 selected = select_snps( data, total_len, total_requested )
580da578c5e6 Uploaded
rico
parents:
diff changeset
55 out_data = fix_selection_and_order_like_input(data, selected, total_requested)
580da578c5e6 Uploaded
rico
parents:
diff changeset
56 write_selected_snps( options.output, out_data, lines, comments )
580da578c5e6 Uploaded
rico
parents:
diff changeset
57
580da578c5e6 Uploaded
rico
parents:
diff changeset
58 def to_int( value ):
580da578c5e6 Uploaded
rico
parents:
diff changeset
59 try:
580da578c5e6 Uploaded
rico
parents:
diff changeset
60 int_value = int( value )
580da578c5e6 Uploaded
rico
parents:
diff changeset
61 except ValueError:
580da578c5e6 Uploaded
rico
parents:
diff changeset
62 int_value = 0
580da578c5e6 Uploaded
rico
parents:
diff changeset
63 return int_value
580da578c5e6 Uploaded
rico
parents:
diff changeset
64
580da578c5e6 Uploaded
rico
parents:
diff changeset
65 def get_snp_lines_data_and_comments( filename, chrom_idx, pos_idx ):
580da578c5e6 Uploaded
rico
parents:
diff changeset
66 fh = open( filename, 'r' )
580da578c5e6 Uploaded
rico
parents:
diff changeset
67 if (chrom_idx >= pos_idx):
580da578c5e6 Uploaded
rico
parents:
diff changeset
68 needed = chrom_idx + 1
580da578c5e6 Uploaded
rico
parents:
diff changeset
69 else:
580da578c5e6 Uploaded
rico
parents:
diff changeset
70 needed = pos_idx + 1
580da578c5e6 Uploaded
rico
parents:
diff changeset
71 lines = []
580da578c5e6 Uploaded
rico
parents:
diff changeset
72 data = []
580da578c5e6 Uploaded
rico
parents:
diff changeset
73 comments = []
580da578c5e6 Uploaded
rico
parents:
diff changeset
74 line_idx = 0
580da578c5e6 Uploaded
rico
parents:
diff changeset
75 line_num = 0
580da578c5e6 Uploaded
rico
parents:
diff changeset
76 for line in fh:
580da578c5e6 Uploaded
rico
parents:
diff changeset
77 line_num += 1
580da578c5e6 Uploaded
rico
parents:
diff changeset
78 line = line.rstrip('\r\n')
580da578c5e6 Uploaded
rico
parents:
diff changeset
79 if line:
580da578c5e6 Uploaded
rico
parents:
diff changeset
80 if line.startswith('#'):
580da578c5e6 Uploaded
rico
parents:
diff changeset
81 comments.append(line)
580da578c5e6 Uploaded
rico
parents:
diff changeset
82 else:
580da578c5e6 Uploaded
rico
parents:
diff changeset
83 elems = line.split('\t')
580da578c5e6 Uploaded
rico
parents:
diff changeset
84 if len(elems) >= needed:
580da578c5e6 Uploaded
rico
parents:
diff changeset
85 chrom = elems[chrom_idx]
580da578c5e6 Uploaded
rico
parents:
diff changeset
86 try:
580da578c5e6 Uploaded
rico
parents:
diff changeset
87 pos = int(elems[pos_idx])
580da578c5e6 Uploaded
rico
parents:
diff changeset
88 except ValueError:
580da578c5e6 Uploaded
rico
parents:
diff changeset
89 sys.stderr.write( "bad reference position in line %d column %d: %s\n" % ( line_num, pos_idx+1, elems[pos_idx] ) )
580da578c5e6 Uploaded
rico
parents:
diff changeset
90 sys.exit(1)
580da578c5e6 Uploaded
rico
parents:
diff changeset
91 lines.append(line)
580da578c5e6 Uploaded
rico
parents:
diff changeset
92 chrom_sort = chrom.lstrip('chr')
580da578c5e6 Uploaded
rico
parents:
diff changeset
93 data.append( [chrom_sort, chrom, pos, line_num, line_idx] )
580da578c5e6 Uploaded
rico
parents:
diff changeset
94 line_idx += 1
580da578c5e6 Uploaded
rico
parents:
diff changeset
95 fh.close()
580da578c5e6 Uploaded
rico
parents:
diff changeset
96 data = sorted( data, key=lambda x: (x[0], x[2]) )
580da578c5e6 Uploaded
rico
parents:
diff changeset
97 return lines, data, comments
580da578c5e6 Uploaded
rico
parents:
diff changeset
98
580da578c5e6 Uploaded
rico
parents:
diff changeset
99 def select_snps( data, total_len, requested ):
580da578c5e6 Uploaded
rico
parents:
diff changeset
100 old_chrom = None
580da578c5e6 Uploaded
rico
parents:
diff changeset
101 next_print = 0
580da578c5e6 Uploaded
rico
parents:
diff changeset
102 selected = []
580da578c5e6 Uploaded
rico
parents:
diff changeset
103 space = total_len / requested
580da578c5e6 Uploaded
rico
parents:
diff changeset
104 for data_idx, datum in enumerate( data ):
580da578c5e6 Uploaded
rico
parents:
diff changeset
105 chrom = datum[1]
580da578c5e6 Uploaded
rico
parents:
diff changeset
106 pos = datum[2]
580da578c5e6 Uploaded
rico
parents:
diff changeset
107 if chrom != old_chrom:
580da578c5e6 Uploaded
rico
parents:
diff changeset
108 old_chrom = chrom
580da578c5e6 Uploaded
rico
parents:
diff changeset
109 next_print = 0
580da578c5e6 Uploaded
rico
parents:
diff changeset
110 if pos >= next_print:
580da578c5e6 Uploaded
rico
parents:
diff changeset
111 selected.append(data_idx)
580da578c5e6 Uploaded
rico
parents:
diff changeset
112 next_print += space
580da578c5e6 Uploaded
rico
parents:
diff changeset
113 return selected
580da578c5e6 Uploaded
rico
parents:
diff changeset
114
580da578c5e6 Uploaded
rico
parents:
diff changeset
115 def fix_selection_and_order_like_input(data, selected, requested):
580da578c5e6 Uploaded
rico
parents:
diff changeset
116 total_selected = len( selected )
580da578c5e6 Uploaded
rico
parents:
diff changeset
117 a = float( total_selected ) / requested
580da578c5e6 Uploaded
rico
parents:
diff changeset
118 b = a / 2
580da578c5e6 Uploaded
rico
parents:
diff changeset
119
580da578c5e6 Uploaded
rico
parents:
diff changeset
120 idx_list = []
580da578c5e6 Uploaded
rico
parents:
diff changeset
121 for i in range( requested ):
580da578c5e6 Uploaded
rico
parents:
diff changeset
122 idx = int( math.ceil( i * a + b ) - 1 )
580da578c5e6 Uploaded
rico
parents:
diff changeset
123 idx_list.append( idx )
580da578c5e6 Uploaded
rico
parents:
diff changeset
124
580da578c5e6 Uploaded
rico
parents:
diff changeset
125 out_data = []
580da578c5e6 Uploaded
rico
parents:
diff changeset
126
580da578c5e6 Uploaded
rico
parents:
diff changeset
127 for i, data_idx in enumerate(selected):
580da578c5e6 Uploaded
rico
parents:
diff changeset
128 if total_selected > requested:
580da578c5e6 Uploaded
rico
parents:
diff changeset
129 if i in idx_list:
580da578c5e6 Uploaded
rico
parents:
diff changeset
130 out_data.append(data[data_idx])
580da578c5e6 Uploaded
rico
parents:
diff changeset
131 else:
580da578c5e6 Uploaded
rico
parents:
diff changeset
132 out_data.append(data[data_idx])
580da578c5e6 Uploaded
rico
parents:
diff changeset
133
580da578c5e6 Uploaded
rico
parents:
diff changeset
134 out_data = sorted( out_data, key=lambda x: x[3] )
580da578c5e6 Uploaded
rico
parents:
diff changeset
135
580da578c5e6 Uploaded
rico
parents:
diff changeset
136 return out_data
580da578c5e6 Uploaded
rico
parents:
diff changeset
137
580da578c5e6 Uploaded
rico
parents:
diff changeset
138 def write_selected_snps( filename, data, lines, comments ):
580da578c5e6 Uploaded
rico
parents:
diff changeset
139 fh = open( filename, 'w' )
580da578c5e6 Uploaded
rico
parents:
diff changeset
140
580da578c5e6 Uploaded
rico
parents:
diff changeset
141 for comment in comments:
580da578c5e6 Uploaded
rico
parents:
diff changeset
142 fh.write("%s\n" % comment )
580da578c5e6 Uploaded
rico
parents:
diff changeset
143
580da578c5e6 Uploaded
rico
parents:
diff changeset
144 for datum in data:
580da578c5e6 Uploaded
rico
parents:
diff changeset
145 line_idx = datum[4]
580da578c5e6 Uploaded
rico
parents:
diff changeset
146 fh.write("%s\n" % lines[line_idx])
580da578c5e6 Uploaded
rico
parents:
diff changeset
147
580da578c5e6 Uploaded
rico
parents:
diff changeset
148 fh.close()
580da578c5e6 Uploaded
rico
parents:
diff changeset
149
580da578c5e6 Uploaded
rico
parents:
diff changeset
150 if __name__ == "__main__":
580da578c5e6 Uploaded
rico
parents:
diff changeset
151 main()
580da578c5e6 Uploaded
rico
parents:
diff changeset
152
580da578c5e6 Uploaded
rico
parents:
diff changeset
153